pensiev-0.25.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memos/__init__.py +6 -0
- memos/cmds/__init__.py +0 -0
- memos/cmds/library.py +1289 -0
- memos/cmds/plugin.py +96 -0
- memos/commands.py +865 -0
- memos/config.py +225 -0
- memos/crud.py +605 -0
- memos/databases/__init__.py +0 -0
- memos/databases/initializers.py +481 -0
- memos/dataset_extractor_for_florence.py +165 -0
- memos/dataset_extractor_for_internvl2.py +192 -0
- memos/default_config.yaml +88 -0
- memos/embedding.py +129 -0
- memos/frame_extractor.py +53 -0
- memos/logging_config.py +35 -0
- memos/main.py +104 -0
- memos/migrations/alembic/README +1 -0
- memos/migrations/alembic/__pycache__/env.cpython-310.pyc +0 -0
- memos/migrations/alembic/env.py +108 -0
- memos/migrations/alembic/script.py.mako +30 -0
- memos/migrations/alembic/versions/00904ac8c6fc_add_indexes_to_entitymodel.py +63 -0
- memos/migrations/alembic/versions/04acdaf75664_add_indices_to_entitytags_and_metadata.py +86 -0
- memos/migrations/alembic/versions/12504c5b1d3c_add_extra_columns_for_embedding.py +67 -0
- memos/migrations/alembic/versions/31a1ad0e10b3_add_entity_plugin_status.py +71 -0
- memos/migrations/alembic/versions/__pycache__/00904ac8c6fc_add_indexes_to_entitymodel.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/04acdaf75664_add_indices_to_entitytags_and_metadata.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/12504c5b1d3c_add_extra_columns_for_embedding.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/20f5ecab014d_add_entity_plugin_status.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/31a1ad0e10b3_add_entity_plugin_status.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/4fcb062c5128_add_extra_columns_for_embedding.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/d10c55fbb7d2_add_index_for_entity_file_type_group_.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/f8f158182416_add_active_app_index.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/d10c55fbb7d2_add_index_for_entity_file_type_group_.py +44 -0
- memos/migrations/alembic/versions/f8f158182416_add_active_app_index.py +75 -0
- memos/migrations/alembic.ini +116 -0
- memos/migrations.py +19 -0
- memos/models.py +199 -0
- memos/plugins/__init__.py +0 -0
- memos/plugins/ocr/__init__.py +0 -0
- memos/plugins/ocr/main.py +251 -0
- memos/plugins/ocr/models/ch_PP-OCRv4_det_infer.onnx +0 -0
- memos/plugins/ocr/models/ch_PP-OCRv4_rec_infer.onnx +0 -0
- memos/plugins/ocr/models/ch_ppocr_mobile_v2.0_cls_train.onnx +0 -0
- memos/plugins/ocr/ppocr-gpu.yaml +43 -0
- memos/plugins/ocr/ppocr.yaml +44 -0
- memos/plugins/ocr/server.py +227 -0
- memos/plugins/ocr/temp_ppocr.yaml +42 -0
- memos/plugins/vlm/__init__.py +0 -0
- memos/plugins/vlm/main.py +251 -0
- memos/prepare_dataset.py +107 -0
- memos/process_webp.py +55 -0
- memos/read_metadata.py +32 -0
- memos/record.py +358 -0
- memos/schemas.py +289 -0
- memos/search.py +1198 -0
- memos/server.py +883 -0
- memos/shotsum.py +105 -0
- memos/shotsum_with_ocr.py +145 -0
- memos/simple_tokenizer/dict/README.md +31 -0
- memos/simple_tokenizer/dict/hmm_model.utf8 +34 -0
- memos/simple_tokenizer/dict/idf.utf8 +258826 -0
- memos/simple_tokenizer/dict/jieba.dict.utf8 +348982 -0
- memos/simple_tokenizer/dict/pos_dict/char_state_tab.utf8 +6653 -0
- memos/simple_tokenizer/dict/pos_dict/prob_emit.utf8 +166 -0
- memos/simple_tokenizer/dict/pos_dict/prob_start.utf8 +259 -0
- memos/simple_tokenizer/dict/pos_dict/prob_trans.utf8 +5222 -0
- memos/simple_tokenizer/dict/stop_words.utf8 +1534 -0
- memos/simple_tokenizer/dict/user.dict.utf8 +4 -0
- memos/simple_tokenizer/linux/libsimple.so +0 -0
- memos/simple_tokenizer/macos/libsimple.dylib +0 -0
- memos/simple_tokenizer/windows/simple.dll +0 -0
- memos/static/_app/immutable/assets/0.e250c031.css +1 -0
- memos/static/_app/immutable/assets/_layout.e7937cfe.css +1 -0
- memos/static/_app/immutable/chunks/index.5c08976b.js +1 -0
- memos/static/_app/immutable/chunks/index.60ee613b.js +4 -0
- memos/static/_app/immutable/chunks/runtime.a7926cf6.js +5 -0
- memos/static/_app/immutable/chunks/scheduler.5c1cff6e.js +1 -0
- memos/static/_app/immutable/chunks/singletons.583bdf4e.js +1 -0
- memos/static/_app/immutable/entry/app.666c1643.js +1 -0
- memos/static/_app/immutable/entry/start.aed5c701.js +3 -0
- memos/static/_app/immutable/nodes/0.5862ea38.js +7 -0
- memos/static/_app/immutable/nodes/1.35378a5e.js +1 -0
- memos/static/_app/immutable/nodes/2.1ccf9ea5.js +81 -0
- memos/static/_app/version.json +1 -0
- memos/static/app.html +36 -0
- memos/static/favicon.png +0 -0
- memos/static/logos/memos_logo_1024.png +0 -0
- memos/static/logos/memos_logo_1024@2x.png +0 -0
- memos/static/logos/memos_logo_128.png +0 -0
- memos/static/logos/memos_logo_128@2x.png +0 -0
- memos/static/logos/memos_logo_16.png +0 -0
- memos/static/logos/memos_logo_16@2x.png +0 -0
- memos/static/logos/memos_logo_256.png +0 -0
- memos/static/logos/memos_logo_256@2x.png +0 -0
- memos/static/logos/memos_logo_32.png +0 -0
- memos/static/logos/memos_logo_32@2x.png +0 -0
- memos/static/logos/memos_logo_512.png +0 -0
- memos/static/logos/memos_logo_512@2x.png +0 -0
- memos/static/logos/memos_logo_64.png +0 -0
- memos/static/logos/memos_logo_64@2x.png +0 -0
- memos/test_server.py +802 -0
- memos/utils.py +49 -0
- memos_ml_backends/florence2_server.py +176 -0
- memos_ml_backends/qwen2vl_server.py +182 -0
- memos_ml_backends/schemas.py +48 -0
- pensiev-0.25.5.dist-info/LICENSE +201 -0
- pensiev-0.25.5.dist-info/METADATA +541 -0
- pensiev-0.25.5.dist-info/RECORD +111 -0
- pensiev-0.25.5.dist-info/WHEEL +5 -0
- pensiev-0.25.5.dist-info/entry_points.txt +2 -0
- pensiev-0.25.5.dist-info/top_level.txt +2 -0
memos/cmds/library.py
ADDED
@@ -0,0 +1,1289 @@
# Standard library imports
import time
import math
import re
import os
import threading
import asyncio
import logging
import logging.config
from pathlib import Path
from datetime import datetime
from enum import Enum
from typing import List, Tuple, Dict, Any, Optional, Set
from functools import lru_cache
from collections import defaultdict, deque

# Third-party imports
import typer
import httpx
from tqdm import tqdm
from tabulate import tabulate
import psutil
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from concurrent.futures import ThreadPoolExecutor

# Local imports
from memos.config import settings
from memos.utils import get_image_metadata
from memos.schemas import MetadataSource
from memos.logging_config import LOGGING_CONFIG


logging.config.dictConfig(LOGGING_CONFIG)
logger = logging.getLogger(__name__)

lib_app = typer.Typer()

file_detector = None

IS_THUMBNAIL = "is_thumbnail"

BASE_URL = settings.server_endpoint

include_files = [".jpg", ".jpeg", ".png", ".webp"]

class FileStatus(Enum):
    UPDATED = "updated"
    ADDED = "added"


def format_timestamp(timestamp):
    if isinstance(timestamp, str):
        return timestamp
    return datetime.fromtimestamp(timestamp).replace(tzinfo=None).isoformat()


def init_file_detector():
    """Initialize the global file detector if not already initialized"""
    global file_detector
    if file_detector is None:
        from magika import Magika

        file_detector = Magika()
    return file_detector


def get_file_type(file_path):
    """Get file type using lazy-loaded detector"""
    detector = init_file_detector()
    file_result = detector.identify_path(file_path)
    return file_result.output.ct_label, file_result.output.group


def display_libraries(libraries):
    table = []
    for library in libraries:
        table.append(
            [
                library["id"],
                library["name"],
                "\n".join(
                    f"{folder['id']}: {folder['path']}" for folder in library["folders"]
                ),
                "\n".join(
                    f"{plugin['id']}: {plugin['name']} {plugin['webhook_url']}"
                    for plugin in library["plugins"]
                ),
            ]
        )

    print(
        tabulate(table, headers=["ID", "Name", "Folders", "Plugins"], tablefmt="plain")
    )


@lib_app.command("ls")
def ls():
    response = httpx.get(f"{BASE_URL}/libraries")
    libraries = response.json()
    display_libraries(libraries)


@lib_app.command("create")
def add(name: str, folders: List[str]):
    absolute_folders = []
    for folder in folders:
        folder_path = Path(folder).resolve()
        absolute_folders.append(
            {
                "path": str(folder_path),
                "last_modified_at": datetime.fromtimestamp(
                    folder_path.stat().st_mtime
                ).isoformat(),
            }
        )

    response = httpx.post(
        f"{BASE_URL}/libraries", json={"name": name, "folders": absolute_folders}
    )
    if 200 <= response.status_code < 300:
        print("Library created successfully")
    else:
        print(f"Failed to create library: {response.status_code} - {response.text}")


@lib_app.command("add-folder")
def add_folder(library_id: int, folders: List[str]):
    absolute_folders = []
    for folder in folders:
        folder_path = Path(folder).resolve()
        absolute_folders.append(
            {
                "path": str(folder_path),
                "last_modified_at": datetime.fromtimestamp(
                    folder_path.stat().st_mtime
                ).isoformat(),
            }
        )

    response = httpx.post(
        f"{BASE_URL}/libraries/{library_id}/folders",
        json={"folders": absolute_folders},
    )
    if 200 <= response.status_code < 300:
        print("Folders added successfully")
        library = response.json()
        display_libraries([library])
    else:
        print(f"Failed to add folders: {response.status_code} - {response.text}")


@lib_app.command("show")
def show(library_id: int):
    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
    if response.status_code == 200:
        library = response.json()
        display_libraries([library])
    else:
        print(f"Failed to retrieve library: {response.status_code} - {response.text}")


def is_temp_file(filename):
    return (
        filename.startswith(".")
        or filename.startswith("tmp")
        or filename.startswith("temp")
    )

async def loop_files(library, folder, folder_path, force, plugins, batch_size):
    """
    Process files in the folder.

    Args:
        library: Library object
        folder: Folder information
        folder_path: Folder path
        force: Whether to force update
        plugins: List of plugins
        batch_size: Batch size

    Returns:
        Tuple[int, int, int]: (Number of files added, Number of files updated, Number of files deleted)
    """
    updated_file_count = 0
    added_file_count = 0
    deleted_file_count = 0
    semaphore = asyncio.Semaphore(batch_size)

    async with httpx.AsyncClient(timeout=60) as client:
        # 1. Collect candidate files
        candidate_files = await collect_candidate_files(folder_path)
        scanned_files = set(candidate_files)

        # 2. Process file batches
        added_file_count, updated_file_count = await process_file_batches(
            client,
            library,
            folder,
            candidate_files,
            force,
            plugins,
            semaphore,
        )

        # 3. Check for deleted files
        deleted_file_count = await check_deleted_files(
            client, library.get("id"), folder, folder_path, scanned_files
        )

    return added_file_count, updated_file_count, deleted_file_count

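# NOTE: loop_files is the per-folder pipeline behind the `scan` command below:
# collect candidate files, upsert them in batches, then prune entities whose
# files no longer exist on disk. The semaphore bounds in-flight requests to
# `batch_size`.
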
@lib_app.command("scan")
def scan(
    library_id: int,
    path: str = typer.Argument(None, help="Path to scan within the library"),
    force: bool = False,
    plugins: List[int] = typer.Option(None, "--plugin", "-p"),
    folders: List[int] = typer.Option(None, "--folder", "-f"),
    batch_size: int = typer.Option(
        1, "--batch-size", "-bs", help="Batch size for processing files"
    ),
):
    # Check if both path and folders are provided
    if path and folders:
        print("Error: You cannot specify both a path and folders at the same time.")
        return

    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
    if response.status_code != 200:
        print(f"Failed to retrieve library: {response.status_code} - {response.text}")
        return

    library = response.json()
    total_files_added = 0
    total_files_updated = 0
    total_files_deleted = 0

    # Filter folders if the folders parameter is provided
    if folders:
        library_folders = [
            folder for folder in library["folders"] if folder["id"] in folders
        ]
    else:
        library_folders = library["folders"]

    # Check if a specific path is provided
    if path:
        path = Path(path).expanduser().resolve()
        # Check if the path is a folder or a subdirectory of a library folder
        folder = next(
            (
                folder
                for folder in library_folders
                if path.is_relative_to(Path(folder["path"]).resolve())
            ),
            None,
        )
        if not folder:
            print(f"Error: The path {path} is not part of any folder in the library.")
            return
        # Only scan the specified path
        library_folders = [{"id": folder["id"], "path": str(path)}]

    for folder in library_folders:
        folder_path = Path(folder["path"])
        if not folder_path.exists() or not folder_path.is_dir():
            tqdm.write(f"Folder does not exist or is not a directory: {folder_path}")
            continue

        added_file_count, updated_file_count, deleted_file_count = asyncio.run(
            loop_files(library, folder, folder_path, force, plugins, batch_size)
        )
        total_files_added += added_file_count
        total_files_updated += updated_file_count
        total_files_deleted += deleted_file_count

    print(f"Total files added: {total_files_added}")
    print(f"Total files updated: {total_files_updated}")
    print(f"Total files deleted: {total_files_deleted}")

async def add_entity(
    client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
    library_id,
    plugins,
    new_entity,
) -> Tuple[str, FileStatus, bool, Optional[httpx.Response]]:
    async with semaphore:
        MAX_RETRIES = 3
        RETRY_DELAY = 2.0
        for attempt in range(MAX_RETRIES):
            try:
                post_response = await client.post(
                    f"{BASE_URL}/libraries/{library_id}/entities",
                    json=new_entity,
                    params=(
                        {"plugins": plugins, "update_index": "true"}
                        if plugins
                        else {"update_index": "true"}
                    ),
                    timeout=60,
                )
                if 200 <= post_response.status_code < 300:
                    return new_entity["filepath"], FileStatus.ADDED, True, post_response
                else:
                    return (
                        new_entity["filepath"],
                        FileStatus.ADDED,
                        False,
                        post_response,
                    )
            except Exception as e:
                logging.error(
                    f"Error while adding entity (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
                )
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY)
                else:
                    return new_entity["filepath"], FileStatus.ADDED, False, None


async def update_entity(
    client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
    plugins,
    new_entity,
    existing_entity,
) -> Tuple[str, FileStatus, bool, Optional[httpx.Response]]:
    MAX_RETRIES = 3
    RETRY_DELAY = 2.0
    async with semaphore:
        for attempt in range(MAX_RETRIES):
            try:
                update_response = await client.put(
                    f"{BASE_URL}/entities/{existing_entity['id']}",
                    json=new_entity,
                    params={
                        "trigger_webhooks_flag": "true",
                        "update_index": "true",
                        **({"plugins": plugins} if plugins else {}),
                    },
                    timeout=60,
                )
                if 200 <= update_response.status_code < 300:
                    return (
                        new_entity["filepath"],
                        FileStatus.UPDATED,
                        True,
                        update_response,
                    )
                else:
                    return (
                        new_entity["filepath"],
                        FileStatus.UPDATED,
                        False,
                        update_response,
                    )
            except Exception as e:
                logging.error(
                    f"Error while updating entity {existing_entity['id']} (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
                )
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY)
                else:
                    return new_entity["filepath"], FileStatus.UPDATED, False, None

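# NOTE: both helpers above retry transport-level failures up to MAX_RETRIES
# with a fixed RETRY_DELAY, and always resolve to a 4-tuple of
# (filepath, FileStatus, succeeded, response-or-None), which
# process_file_batches unpacks via asyncio.as_completed.
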
@lib_app.command("reindex")
def reindex(
    library_id: int,
    folders: List[int] = typer.Option(None, "--folder", "-f"),
    force: bool = typer.Option(
        False, "--force", help="Force recreate FTS and vector tables before reindexing"
    ),
    batch_size: int = typer.Option(
        1, "--batch-size", "-bs", help="Batch size for processing entities"
    ),
):
    print(f"Reindexing library {library_id}")

    from memos.databases.initializers import recreate_fts_and_vec_tables

    # Get the library
    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
    if response.status_code != 200:
        print(f"Failed to get library: {response.status_code} - {response.text}")
        return

    library = response.json()
    scanned_entities = set()

    # Filter folders if the folders parameter is provided
    if folders:
        library_folders = [
            folder for folder in library["folders"] if folder["id"] in folders
        ]
    else:
        library_folders = library["folders"]

    if force:
        print("Force flag is set. Recreating FTS and vector tables...")
        if not recreate_fts_and_vec_tables(settings):
            return
        print("FTS and vector tables have been recreated.")

    with httpx.Client() as client:
        total_entities = 0

        # Get total entity count for all folders
        for folder in library_folders:
            response = client.get(
                f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
                params={"limit": 1, "offset": 0},
            )
            if response.status_code == 200:
                total_entities += int(response.headers.get("X-Total-Count", 0))
            else:
                print(
                    f"Failed to get entity count for folder {folder['id']}: {response.status_code} - {response.text}"
                )

        # Now process entities with a progress bar
        with tqdm(total=total_entities, desc="Reindexing entities") as pbar:
            for folder in library_folders:
                print(f"Processing folder: {folder['id']}")

                # List all entities in the folder
                limit = 200
                offset = 0
                while True:
                    entities_response = client.get(
                        f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
                        params={"limit": limit, "offset": offset},
                    )
                    if entities_response.status_code != 200:
                        print(
                            f"Failed to get entities: {entities_response.status_code} - {entities_response.text}"
                        )
                        break

                    entities = entities_response.json()
                    if not entities:
                        break

                    # Collect entity IDs to be processed
                    entity_ids = [
                        entity["id"]
                        for entity in entities
                        if entity["id"] not in scanned_entities
                    ]

                    # Process in batches
                    for i in range(0, len(entity_ids), batch_size):
                        batch_ids = entity_ids[i : i + batch_size]
                        if batch_ids:
                            batch_response = client.post(
                                f"{BASE_URL}/entities/batch-index",
                                json={"entity_ids": batch_ids},
                                timeout=60,
                            )
                            if batch_response.status_code != 204:
                                print(
                                    f"Failed to update batch: {batch_response.status_code} - {batch_response.text}"
                                )
                            pbar.update(len(batch_ids))
                            scanned_entities.update(batch_ids)

                    offset += limit

    if folders:
        print(f"Reindexing completed for library {library_id} with folders: {folders}")
    else:
        print(f"Reindexing completed for library {library_id}")

def has_entity_changes(new_entity: dict, existing_entity: dict) -> bool:
    """
    Compare new_entity with existing_entity to determine if there are actual changes.
    Returns True if there are differences, False otherwise.
    """
    # Compare basic fields
    basic_fields = [
        "filename",
        "filepath",
        "size",
        "file_created_at",
        "file_last_modified_at",
        "file_type",
        "file_type_group",
    ]

    for field in basic_fields:
        if new_entity.get(field) != existing_entity.get(field):
            return True

    # Compare metadata entries
    new_metadata = {
        (entry["key"], entry["value"])
        for entry in new_entity.get("metadata_entries", [])
    }
    existing_metadata = {
        (entry["key"], entry["value"])
        for entry in existing_entity.get("metadata_entries", [])
    }
    if new_metadata != existing_metadata:
        return True

    # Compare tags
    new_tags = set(new_entity.get("tags", []))
    existing_tags = {tag["name"] for tag in existing_entity.get("tags", [])}
    if new_tags != existing_tags:
        return True

    return False

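# Example (hypothetical values): an entity whose "size" changed from 1024 to
# 2048 returns True on the basic-field pass; one that only gained the tag
# "chrome" returns True on the tag comparison; fully identical dicts fall
# through to False, so the caller can skip the PUT entirely.
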
@lib_app.command("sync")
def sync(
    library_id: int,
    filepath: str,
    force: bool = typer.Option(
        False, "--force", "-f", help="Force update the file even if it hasn't changed"
    ),
    without_webhooks: bool = typer.Option(
        False, "--no-plugins", help="Disable plugin triggers", is_flag=True
    ),
):
    """
    Sync a specific file with the library.
    """
    # 1. Get library by id and check if it exists
    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
    if response.status_code != 200:
        typer.echo(f"Error: Library with id {library_id} not found.")
        raise typer.Exit(code=1)

    library = response.json()

    # Convert filepath to absolute path
    file_path = Path(filepath).resolve()

    if not file_path.is_file():
        typer.echo(f"Error: File {file_path} does not exist.")
        raise typer.Exit(code=1)

    # 2. Check if the file exists in the library
    response = httpx.get(
        f"{BASE_URL}/libraries/{library_id}/entities/by-filepath",
        params={"filepath": str(file_path)},
    )

    file_stat = file_path.stat()
    file_type, file_type_group = get_file_type(file_path)

    new_entity = {
        "filename": file_path.name,
        "filepath": str(file_path),
        "size": file_stat.st_size,
        "file_created_at": format_timestamp(file_stat.st_ctime),
        "file_last_modified_at": format_timestamp(file_stat.st_mtime),
        "file_type": file_type,
        "file_type_group": file_type_group,
    }

    # Handle metadata
    is_thumbnail = False
    if file_type_group == "image":
        metadata = get_image_metadata(file_path)
        if metadata:
            if "active_window" in metadata and "active_app" not in metadata:
                metadata["active_app"] = metadata["active_window"].split(" - ")[0]
            new_entity["metadata_entries"] = [
                {
                    "key": key,
                    "value": str(value),
                    "source": MetadataSource.SYSTEM_GENERATED.value,
                    "data_type": (
                        "number" if isinstance(value, (int, float)) else "text"
                    ),
                }
                for key, value in metadata.items()
                if key != IS_THUMBNAIL
            ]
            if "active_app" in metadata:
                new_entity.setdefault("tags", []).append(metadata["active_app"])
            is_thumbnail = metadata.get(IS_THUMBNAIL, False)

    if is_thumbnail:
        typer.echo(f"Skipping thumbnail file: {file_path}")
        return

    if response.status_code == 200:
        # File exists, update it
        existing_entity = response.json()
        new_entity["folder_id"] = existing_entity["folder_id"]

        if is_thumbnail:
            new_entity["file_created_at"] = existing_entity["file_created_at"]
            new_entity["file_last_modified_at"] = existing_entity[
                "file_last_modified_at"
            ]
            new_entity["file_type"] = existing_entity["file_type"]
            new_entity["file_type_group"] = existing_entity["file_type_group"]
            new_entity["size"] = existing_entity["size"]

        if not force:
            # Merge existing metadata with new metadata
            new_metadata_keys = {
                entry["key"] for entry in new_entity.get("metadata_entries", [])
            }
            for existing_entry in existing_entity.get("metadata_entries", []):
                if existing_entry["key"] not in new_metadata_keys:
                    # setdefault guards non-image files, which have no
                    # "metadata_entries" key at this point
                    new_entity.setdefault("metadata_entries", []).append(
                        existing_entry
                    )

            # Merge existing tags with new tags
            existing_tags = {tag["name"] for tag in existing_entity.get("tags", [])}
            new_tags = set(new_entity.get("tags", []))
            merged_tags = new_tags.union(existing_tags)
            new_entity["tags"] = list(merged_tags)

        # Only update if there are actual changes or force flag is set
        if force or has_entity_changes(new_entity, existing_entity):
            update_response = httpx.put(
                f"{BASE_URL}/entities/{existing_entity['id']}",
                json=new_entity,
                params={
                    "trigger_webhooks_flag": str(not without_webhooks).lower(),
                    "update_index": "true",
                },
                timeout=60,
            )
            if update_response.status_code == 200:
                typer.echo(f"Updated file: {file_path}")
            else:
                typer.echo(
                    f"Error updating file: {update_response.status_code} - {update_response.text}"
                )
        else:
            typer.echo(f"File {file_path} is up to date. No changes detected.")

    else:
        # 3. File doesn't exist, check if it belongs to a folder in the library
        folder = next(
            (
                folder
                for folder in library["folders"]
                if str(file_path).startswith(folder["path"])
            ),
            None,
        )

        if folder:
            # Create new entity
            new_entity["folder_id"] = folder["id"]

            create_response = httpx.post(
                f"{BASE_URL}/libraries/{library_id}/entities",
                json=new_entity,
                params={
                    "trigger_webhooks_flag": str(not without_webhooks).lower(),
                    "update_index": "true",
                },
                timeout=60,
            )

            if create_response.status_code == 200:
                typer.echo(f"Created new entity for file: {file_path}")
            else:
                typer.echo(
                    f"Error creating entity: {create_response.status_code} - {create_response.text}"
                )

        else:
            # 4. File doesn't belong to any folder in the library
            typer.echo(
                f"Error: File {file_path} does not belong to any folder in the library."
            )
            raise typer.Exit(code=1)

@lru_cache(maxsize=1)
def is_on_battery():
    try:
        battery = psutil.sensors_battery()
        return battery is not None and not battery.power_plugged
    except Exception:
        return False  # If unable to detect battery status, assume not on battery


# Modify the LibraryFileHandler class
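# NOTE: lru_cache(maxsize=1) memoizes the battery probe so the hot path in
# update_processing_interval reads a cached value; the handler calls
# is_on_battery.cache_clear() at most once per battery_check_interval (60 s)
# to force a fresh psutil reading.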
class LibraryFileHandler(FileSystemEventHandler):
    def __init__(
        self,
        library_id,
        include_files,
        max_workers=2,
        sparsity_factor=3,
        rate_window_size=10,
        processing_interval=12,
    ):
        self.library_id = library_id
        self.include_files = include_files
        self.inode_pattern = re.compile(r"\._.+")
        self.pending_files = defaultdict(lambda: {"timestamp": 0, "last_size": 0})
        self.buffer_time = 2
        self.executor = ThreadPoolExecutor(max_workers=max_workers)
        self.lock = threading.Lock()

        self.processing_interval = processing_interval
        self.sparsity_factor = sparsity_factor
        self.rate_window_size = rate_window_size

        self.file_change_intervals = deque(maxlen=rate_window_size)
        self.file_processing_durations = deque(maxlen=rate_window_size)

        self.file_count = 0
        self.file_submitted = 0
        self.file_synced = 0
        self.file_skipped = 0
        self.logger = logger

        self.last_battery_check = 0
        self.battery_check_interval = 60  # Check battery status every 60 seconds

    def handle_event(self, event):
        if not event.is_directory and self.is_valid_file(event.src_path):
            current_time = time.time()
            with self.lock:
                file_info = self.pending_files[event.src_path]

                if current_time - file_info["timestamp"] > self.buffer_time:
                    file_info["timestamp"] = current_time
                    self.file_change_intervals.append(current_time)

                file_info["last_size"] = os.path.getsize(event.src_path)

            return True
        return False

    def process_pending_files(self):
        current_time = time.time()
        files_to_process_with_plugins = []
        files_to_process_without_plugins = []
        processed_in_current_loop = 0
        with self.lock:
            for path, file_info in list(self.pending_files.items()):
                if current_time - file_info["timestamp"] > self.buffer_time:
                    processed_in_current_loop += 1
                    if os.path.exists(path) and os.path.getsize(path) > 0:
                        self.file_count += 1
                        if self.file_count % self.processing_interval == 0:
                            files_to_process_with_plugins.append(path)
                            print(
                                f"file_count % processing_interval: {self.file_count} % {self.processing_interval} == 0"
                            )
                            print(f"Picked file for processing with plugins: {path}")
                        else:
                            files_to_process_without_plugins.append(path)
                            self.file_skipped += 1
                        del self.pending_files[path]
                    elif not os.path.exists(path):
                        del self.pending_files[path]

        # Process files with plugins - these count as submitted
        for path in files_to_process_with_plugins:
            self.executor.submit(self.process_file, path, False)
            self.file_submitted += 1

        # Process files without plugins - these don't count as submitted
        for path in files_to_process_without_plugins:
            self.executor.submit(self.process_file, path, True)

        if processed_in_current_loop > 0:
            self.logger.info(
                f"File count: {self.file_count}, Files submitted: {self.file_submitted}, Files synced: {self.file_synced}, Files skipped: {self.file_skipped}"
            )

        self.update_processing_interval()

    def process_file(self, path, no_plugins):
        self.logger.debug(f"Processing file: {path} (with plugins: {not no_plugins})")
        start_time = time.time()
        sync(self.library_id, path, without_webhooks=no_plugins)
        end_time = time.time()
        if not no_plugins:
            with self.lock:
                self.file_processing_durations.append(end_time - start_time)
                self.file_synced += 1

    def update_processing_interval(self):
        min_samples = max(3, self.rate_window_size // 3)
        max_interval = 60  # Maximum allowed interval between events in seconds

        if (
            len(self.file_change_intervals) >= min_samples
            and len(self.file_processing_durations) >= min_samples
        ):
            # Filter out large time gaps
            filtered_intervals = [
                self.file_change_intervals[i] - self.file_change_intervals[i - 1]
                for i in range(1, len(self.file_change_intervals))
                if self.file_change_intervals[i] - self.file_change_intervals[i - 1]
                <= max_interval
            ]

            if filtered_intervals:
                avg_change_interval = sum(filtered_intervals) / len(filtered_intervals)
                changes_per_second = (
                    1 / avg_change_interval if avg_change_interval > 0 else 0
                )
            else:
                changes_per_second = 0

            total_processing_time = sum(self.file_processing_durations)
            processing_per_second = (
                len(self.file_processing_durations) / total_processing_time
                if total_processing_time > 0
                else 0
            )

            if changes_per_second > 0 and processing_per_second > 0:
                rate = changes_per_second / processing_per_second
                new_processing_interval = max(1, math.ceil(self.sparsity_factor * rate))

                current_time = time.time()
                if current_time - self.last_battery_check > self.battery_check_interval:
                    self.last_battery_check = current_time
                    is_on_battery.cache_clear()  # Clear the cache to get fresh battery status
                if is_on_battery():
                    new_processing_interval *= 2
                    self.logger.info(
                        "Running on battery, doubling the processing interval."
                    )

                if new_processing_interval != self.processing_interval:
                    old_processing_interval = self.processing_interval
                    self.processing_interval = new_processing_interval
                    self.logger.info(
                        f"Processing interval: {old_processing_interval} -> {self.processing_interval}, Changes: {changes_per_second:.2f}it/s, Processing: {processing_per_second:.2f}it/s, Rate (changes/processing): {rate:.2f}"
                    )

    def is_valid_file(self, path):
        filename = os.path.basename(path)
        return (
            any(path.lower().endswith(ext) for ext in self.include_files)
            and not is_temp_file(filename)
            and not self.inode_pattern.match(filename)
        )

    def on_created(self, event):
        self.handle_event(event)

    def on_modified(self, event):
        self.handle_event(event)

    def on_moved(self, event):
        if self.handle_event(event):
            # For moved events, we need to update the key in pending_files
            with self.lock:
                self.pending_files[event.dest_path] = self.pending_files.pop(
                    event.src_path, {"timestamp": time.time(), "last_size": 0}
                )

    def on_deleted(self, event):
        if self.is_valid_file(event.src_path):
            self.logger.info(f"File deleted: {event.src_path}")
            # Remove from pending files if it was there
            with self.lock:
                self.pending_files.pop(event.src_path, None)
            # Add logic for handling deleted files if needed

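# Worked example for update_processing_interval (hypothetical rates): with
# files changing at 2.0/s and plugin processing sustaining 0.5/s,
# rate = 2.0 / 0.5 = 4.0; with the default sparsity_factor of 3 the interval
# becomes max(1, ceil(3 * 4.0)) = 12, i.e. one file in every 12 goes through
# plugins, and 24 while on battery.
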
@lib_app.command("watch")
def watch(
    library_id: int,
    folders: List[int] = typer.Option(
        None, "--folder", "-f", help="Specify folders to watch"
    ),
    sparsity_factor: float = typer.Option(
        3.0, "--sparsity-factor", "-sf", help="Sparsity factor for file processing"
    ),
    processing_interval: int = typer.Option(
        12,
        "--processing-interval",
        "-pi",
        help="Process one file with plugins for every N files (higher means less frequent processing)",
    ),
    rate_window_size: int = typer.Option(
        10,
        "--rate-window",
        "-rw",
        help="Number of recent events to consider when calculating processing rates",
    ),
    verbose: bool = typer.Option(
        False, "--verbose", "-v", help="Enable verbose logging"
    ),
):
    """
    Watch for file changes in the library folders and sync automatically.
    """
    # Set the logging level based on the verbose flag
    log_level = "DEBUG" if verbose else "INFO"
    logger.setLevel(log_level)

    logger.info(f"Watching library {library_id} for changes...")

    # Get the library
    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
    if response.status_code != 200:
        print(f"Error: Library with id {library_id} not found.")
        raise typer.Exit(code=1)

    library = response.json()

    # Filter folders if the folders parameter is provided
    if folders:
        library_folders = [
            folder for folder in library["folders"] if folder["id"] in folders
        ]
    else:
        library_folders = library["folders"]

    if not library_folders:
        print("No folders to watch.")
        return

    # Create an observer and handler for each folder in the library
    observer = Observer()
    handlers = []
    for folder in library_folders:
        folder_path = Path(folder["path"])
        event_handler = LibraryFileHandler(
            library_id,
            include_files,
            sparsity_factor=sparsity_factor,
            processing_interval=processing_interval,
            rate_window_size=rate_window_size,
        )
        handlers.append(event_handler)
        observer.schedule(event_handler, str(folder_path), recursive=True)
        print(f"Watching folder: {folder_path}")

    observer.start()
    try:
        while True:
            time.sleep(5)
            for handler in handlers:
                handler.process_pending_files()
    except KeyboardInterrupt:
        observer.stop()
        for handler in handlers:
            handler.executor.shutdown(wait=True)
    observer.join()

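# Usage sketch (assuming the package's console entry point exposes this Typer
# app under `memos lib`):
#     memos lib watch 1 --folder 2 --sparsity-factor 3 -pi 12
# watches folder 2 of library 1 recursively, flushing pending files every 5 s,
# until interrupted with Ctrl-C.
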
async def collect_candidate_files(folder_path: Path) -> List[str]:
    """
    Collect candidate files to be processed.

    Args:
        folder_path: Folder path

    Returns:
        List[str]: List of candidate file paths
    """
    candidate_files = []
    for root, _, files in os.walk(folder_path):
        with tqdm(total=len(files), desc=f"Scanning {root}", leave=True) as pbar:
            for file in files:
                file_path = Path(root) / file
                absolute_file_path = file_path.resolve()

                # Check if the file extension is in the include_files list and is not a temporary file
                if file_path.suffix.lower() in include_files and not is_temp_file(file):
                    candidate_files.append(str(absolute_file_path))
                pbar.update(1)

    return candidate_files

async def prepare_entity(file_path: str, folder_id: int) -> Dict[str, Any]:
    """
    Prepare entity data.

    Args:
        file_path: File path
        folder_id: Folder ID

    Returns:
        Dict[str, Any]: Entity data
    """
    file_path = Path(file_path)
    file_stat = file_path.stat()
    file_type, file_type_group = get_file_type(file_path)

    new_entity = {
        "filename": file_path.name,
        "filepath": str(file_path),
        "size": file_stat.st_size,
        "file_created_at": format_timestamp(file_stat.st_ctime),
        "file_last_modified_at": format_timestamp(file_stat.st_mtime),
        "file_type": file_type,
        "file_type_group": file_type_group,
        "folder_id": folder_id,
    }

    # Handle image metadata
    is_thumbnail = False
    if file_type_group == "image":
        metadata = get_image_metadata(file_path)
        if metadata:
            if "active_window" in metadata and "active_app" not in metadata:
                metadata["active_app"] = metadata["active_window"].split(" - ")[0]
            new_entity["metadata_entries"] = [
                {
                    "key": key,
                    "value": str(value),
                    "source": MetadataSource.SYSTEM_GENERATED.value,
                    "data_type": (
                        "number" if isinstance(value, (int, float)) else "text"
                    ),
                }
                for key, value in metadata.items()
                if key != IS_THUMBNAIL
            ]
            if "active_app" in metadata:
                new_entity.setdefault("tags", []).append(metadata["active_app"])
            is_thumbnail = metadata.get(IS_THUMBNAIL, False)

    new_entity["is_thumbnail"] = is_thumbnail
    return new_entity

def format_error_message(
    file_status: FileStatus, response: Optional[httpx.Response]
) -> str:
    """
    Format error message.

    Args:
        file_status: File status
        response: HTTP response

    Returns:
        str: Formatted error message
    """
    action = "add" if file_status == FileStatus.ADDED else "update"
    error_message = f"Failed to {action} file"

    if response:
        if hasattr(response, "status_code"):
            error_message += f": {response.status_code}"
        if hasattr(response, "text"):
            error_message += f" - {response.text}"
    else:
        error_message += " - Unknown error occurred"

    return error_message

async def process_file_batches(
    client: httpx.AsyncClient,
    library: dict,
    folder: dict,
    candidate_files: list,
    force: bool,
    plugins: list,
    semaphore: asyncio.Semaphore,
) -> Tuple[int, int]:
    """
    Process file batches.

    Args:
        client: httpx async client
        library: Library object
        folder: Folder information
        candidate_files: List of candidate files
        force: Whether to force update
        plugins: List of plugins
        semaphore: Concurrency control semaphore

    Returns:
        Tuple[int, int]: (Number of files added, Number of files updated)
    """
    added_file_count = 0
    updated_file_count = 0
    batching = 50

    library_id = library.get("id")
    library_plugins = [plugin.get("id") for plugin in library.get("plugins", [])]
    target_plugins = (
        library_plugins
        if plugins is None
        else [plugin for plugin in library_plugins if plugin in plugins]
    )

    with tqdm(total=len(candidate_files), desc="Processing files", leave=True) as pbar:
        for i in range(0, len(candidate_files), batching):
            batch = candidate_files[i : i + batching]

            # Get existing entities in the batch
            get_response = await client.post(
                f"{BASE_URL}/libraries/{library_id}/entities/by-filepaths",
                json=batch,
            )

            if get_response.status_code != 200:
                print(
                    f"Failed to get entities: {get_response.status_code} - {get_response.text}"
                )
                pbar.update(len(batch))
                continue

            existing_entities = get_response.json()
            existing_entities_dict = {
                entity["filepath"]: entity for entity in existing_entities
            }

            # Process each file
            tasks = []
            for file_path in batch:
                new_entity = await prepare_entity(file_path, folder["id"])

                if new_entity.get("is_thumbnail", False):
                    typer.echo(f"Skipping thumbnail file: {file_path}")
                    continue

                existing_entity = existing_entities_dict.get(str(file_path))
                if existing_entity:
                    if force:
                        # Directly update without merging if force is true
                        tasks.append(
                            update_entity(
                                client, semaphore, plugins, new_entity, existing_entity
                            )
                        )
                    else:
                        # Merge existing metadata with new metadata
                        new_metadata_keys = {
                            entry["key"]
                            for entry in new_entity.get("metadata_entries", [])
                        }
                        for existing_entry in existing_entity.get(
                            "metadata_entries", []
                        ):
                            if existing_entry["key"] not in new_metadata_keys:
                                new_entity.setdefault("metadata_entries", []).append(
                                    existing_entry
                                )

                        # Merge existing tags with new tags
                        existing_tags = {
                            tag["name"] for tag in existing_entity.get("tags", [])
                        }
                        new_tags = set(new_entity.get("tags", []))
                        merged_tags = new_tags.union(existing_tags)
                        new_entity["tags"] = list(merged_tags)

                        # Check if the entity needs to be processed by any plugins
                        processed_plugins = {
                            plugin_status.get("plugin_id")
                            for plugin_status in existing_entity.get(
                                "plugin_status", []
                            )
                        }
                        has_unprocessed_plugins = any(
                            plugin_id not in processed_plugins
                            for plugin_id in target_plugins
                        )

                        # Only update if there are actual changes or the entity
                        # still needs to be processed by any plugins
                        if has_unprocessed_plugins or has_entity_changes(
                            new_entity, existing_entity
                        ):
                            tasks.append(
                                update_entity(
                                    client,
                                    semaphore,
                                    plugins,
                                    new_entity,
                                    existing_entity,
                                )
                            )
                        else:
                            pbar.write(
                                f"Skipping file: {file_path} #{existing_entity.get('id')}"
                            )
                            pbar.update(1)
                            continue
                else:
                    tasks.append(
                        add_entity(client, semaphore, library_id, plugins, new_entity)
                    )

            # Process task results
            if tasks:
                for future in asyncio.as_completed(tasks):
                    file_path, file_status, succeeded, response = await future
                    if succeeded:
                        if file_status == FileStatus.ADDED:
                            added_file_count += 1
                            tqdm.write(f"Added file to library: {file_path}")
                        else:
                            updated_file_count += 1
                            tqdm.write(f"Updated file in library: {file_path}")
                    else:
                        error_message = format_error_message(file_status, response)
                        tqdm.write(error_message)

                    # Update progress bar for each file processed
                    pbar.update(1)
                    pbar.set_postfix(
                        {"Added": added_file_count, "Updated": updated_file_count},
                        refresh=True,
                    )

    return added_file_count, updated_file_count

async def check_deleted_files(
    client: httpx.AsyncClient,
    library_id: int,
    folder: dict,
    folder_path: Path,
    scanned_files: Set[str],
) -> int:
    """
    Check and handle deleted files.

    Args:
        client: httpx async client
        library_id: Library ID
        folder: Folder information
        folder_path: Folder path
        scanned_files: Set of scanned files

    Returns:
        int: Number of deleted files
    """
    deleted_count = 0
    limit = 100
    offset = 0
    total_entities = 0

    with tqdm(
        total=total_entities, desc="Checking for deleted files", leave=True
    ) as pbar:
        while True:
            # Add path_prefix parameter to only get entities under the folder_path
            existing_files_response = await client.get(
                f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
                params={
                    "limit": limit,
                    "offset": offset,
                    "path_prefix": str(folder_path),
                },
                timeout=60,
            )

            if existing_files_response.status_code != 200:
                pbar.write(
                    f"Failed to retrieve existing files: {existing_files_response.status_code} - {existing_files_response.text}"
                )
                break

            existing_files = existing_files_response.json()
            if not existing_files:
                break

            # Update total count (if this is the first request)
            if offset == 0:
                total_entities = int(
                    existing_files_response.headers.get("X-Total-Count", total_entities)
                )
                pbar.total = total_entities
                pbar.refresh()

            for existing_file in existing_files:
                if (
                    # path_prefix may match files outside folder_path, e.g. when
                    # folder_path is 20241101 but a sibling folder 20241101-copy
                    # exists, so the relative_to check below is required; do not
                    # remove it.
                    Path(existing_file["filepath"]).is_relative_to(folder_path)
                    and existing_file["filepath"] not in scanned_files
                ):
                    # File has been deleted
                    delete_response = await client.delete(
                        f"{BASE_URL}/libraries/{library_id}/entities/{existing_file['id']}"
                    )
                    if 200 <= delete_response.status_code < 300:
                        pbar.write(
                            f"Deleted file from library: {existing_file['filepath']}"
                        )
                        deleted_count += 1
                    else:
                        pbar.write(
                            f"Failed to delete file: {delete_response.status_code} - {delete_response.text}"
                        )
                pbar.update(1)

            offset += limit

    return deleted_count