pensiev-0.25.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. memos/__init__.py +6 -0
  2. memos/cmds/__init__.py +0 -0
  3. memos/cmds/library.py +1289 -0
  4. memos/cmds/plugin.py +96 -0
  5. memos/commands.py +865 -0
  6. memos/config.py +225 -0
  7. memos/crud.py +605 -0
  8. memos/databases/__init__.py +0 -0
  9. memos/databases/initializers.py +481 -0
  10. memos/dataset_extractor_for_florence.py +165 -0
  11. memos/dataset_extractor_for_internvl2.py +192 -0
  12. memos/default_config.yaml +88 -0
  13. memos/embedding.py +129 -0
  14. memos/frame_extractor.py +53 -0
  15. memos/logging_config.py +35 -0
  16. memos/main.py +104 -0
  17. memos/migrations/alembic/README +1 -0
  18. memos/migrations/alembic/__pycache__/env.cpython-310.pyc +0 -0
  19. memos/migrations/alembic/env.py +108 -0
  20. memos/migrations/alembic/script.py.mako +30 -0
  21. memos/migrations/alembic/versions/00904ac8c6fc_add_indexes_to_entitymodel.py +63 -0
  22. memos/migrations/alembic/versions/04acdaf75664_add_indices_to_entitytags_and_metadata.py +86 -0
  23. memos/migrations/alembic/versions/12504c5b1d3c_add_extra_columns_for_embedding.py +67 -0
  24. memos/migrations/alembic/versions/31a1ad0e10b3_add_entity_plugin_status.py +71 -0
  25. memos/migrations/alembic/versions/__pycache__/00904ac8c6fc_add_indexes_to_entitymodel.cpython-310.pyc +0 -0
  26. memos/migrations/alembic/versions/__pycache__/04acdaf75664_add_indices_to_entitytags_and_metadata.cpython-310.pyc +0 -0
  27. memos/migrations/alembic/versions/__pycache__/12504c5b1d3c_add_extra_columns_for_embedding.cpython-310.pyc +0 -0
  28. memos/migrations/alembic/versions/__pycache__/20f5ecab014d_add_entity_plugin_status.cpython-310.pyc +0 -0
  29. memos/migrations/alembic/versions/__pycache__/31a1ad0e10b3_add_entity_plugin_status.cpython-310.pyc +0 -0
  30. memos/migrations/alembic/versions/__pycache__/4fcb062c5128_add_extra_columns_for_embedding.cpython-310.pyc +0 -0
  31. memos/migrations/alembic/versions/__pycache__/d10c55fbb7d2_add_index_for_entity_file_type_group_.cpython-310.pyc +0 -0
  32. memos/migrations/alembic/versions/__pycache__/f8f158182416_add_active_app_index.cpython-310.pyc +0 -0
  33. memos/migrations/alembic/versions/d10c55fbb7d2_add_index_for_entity_file_type_group_.py +44 -0
  34. memos/migrations/alembic/versions/f8f158182416_add_active_app_index.py +75 -0
  35. memos/migrations/alembic.ini +116 -0
  36. memos/migrations.py +19 -0
  37. memos/models.py +199 -0
  38. memos/plugins/__init__.py +0 -0
  39. memos/plugins/ocr/__init__.py +0 -0
  40. memos/plugins/ocr/main.py +251 -0
  41. memos/plugins/ocr/models/ch_PP-OCRv4_det_infer.onnx +0 -0
  42. memos/plugins/ocr/models/ch_PP-OCRv4_rec_infer.onnx +0 -0
  43. memos/plugins/ocr/models/ch_ppocr_mobile_v2.0_cls_train.onnx +0 -0
  44. memos/plugins/ocr/ppocr-gpu.yaml +43 -0
  45. memos/plugins/ocr/ppocr.yaml +44 -0
  46. memos/plugins/ocr/server.py +227 -0
  47. memos/plugins/ocr/temp_ppocr.yaml +42 -0
  48. memos/plugins/vlm/__init__.py +0 -0
  49. memos/plugins/vlm/main.py +251 -0
  50. memos/prepare_dataset.py +107 -0
  51. memos/process_webp.py +55 -0
  52. memos/read_metadata.py +32 -0
  53. memos/record.py +358 -0
  54. memos/schemas.py +289 -0
  55. memos/search.py +1198 -0
  56. memos/server.py +883 -0
  57. memos/shotsum.py +105 -0
  58. memos/shotsum_with_ocr.py +145 -0
  59. memos/simple_tokenizer/dict/README.md +31 -0
  60. memos/simple_tokenizer/dict/hmm_model.utf8 +34 -0
  61. memos/simple_tokenizer/dict/idf.utf8 +258826 -0
  62. memos/simple_tokenizer/dict/jieba.dict.utf8 +348982 -0
  63. memos/simple_tokenizer/dict/pos_dict/char_state_tab.utf8 +6653 -0
  64. memos/simple_tokenizer/dict/pos_dict/prob_emit.utf8 +166 -0
  65. memos/simple_tokenizer/dict/pos_dict/prob_start.utf8 +259 -0
  66. memos/simple_tokenizer/dict/pos_dict/prob_trans.utf8 +5222 -0
  67. memos/simple_tokenizer/dict/stop_words.utf8 +1534 -0
  68. memos/simple_tokenizer/dict/user.dict.utf8 +4 -0
  69. memos/simple_tokenizer/linux/libsimple.so +0 -0
  70. memos/simple_tokenizer/macos/libsimple.dylib +0 -0
  71. memos/simple_tokenizer/windows/simple.dll +0 -0
  72. memos/static/_app/immutable/assets/0.e250c031.css +1 -0
  73. memos/static/_app/immutable/assets/_layout.e7937cfe.css +1 -0
  74. memos/static/_app/immutable/chunks/index.5c08976b.js +1 -0
  75. memos/static/_app/immutable/chunks/index.60ee613b.js +4 -0
  76. memos/static/_app/immutable/chunks/runtime.a7926cf6.js +5 -0
  77. memos/static/_app/immutable/chunks/scheduler.5c1cff6e.js +1 -0
  78. memos/static/_app/immutable/chunks/singletons.583bdf4e.js +1 -0
  79. memos/static/_app/immutable/entry/app.666c1643.js +1 -0
  80. memos/static/_app/immutable/entry/start.aed5c701.js +3 -0
  81. memos/static/_app/immutable/nodes/0.5862ea38.js +7 -0
  82. memos/static/_app/immutable/nodes/1.35378a5e.js +1 -0
  83. memos/static/_app/immutable/nodes/2.1ccf9ea5.js +81 -0
  84. memos/static/_app/version.json +1 -0
  85. memos/static/app.html +36 -0
  86. memos/static/favicon.png +0 -0
  87. memos/static/logos/memos_logo_1024.png +0 -0
  88. memos/static/logos/memos_logo_1024@2x.png +0 -0
  89. memos/static/logos/memos_logo_128.png +0 -0
  90. memos/static/logos/memos_logo_128@2x.png +0 -0
  91. memos/static/logos/memos_logo_16.png +0 -0
  92. memos/static/logos/memos_logo_16@2x.png +0 -0
  93. memos/static/logos/memos_logo_256.png +0 -0
  94. memos/static/logos/memos_logo_256@2x.png +0 -0
  95. memos/static/logos/memos_logo_32.png +0 -0
  96. memos/static/logos/memos_logo_32@2x.png +0 -0
  97. memos/static/logos/memos_logo_512.png +0 -0
  98. memos/static/logos/memos_logo_512@2x.png +0 -0
  99. memos/static/logos/memos_logo_64.png +0 -0
  100. memos/static/logos/memos_logo_64@2x.png +0 -0
  101. memos/test_server.py +802 -0
  102. memos/utils.py +49 -0
  103. memos_ml_backends/florence2_server.py +176 -0
  104. memos_ml_backends/qwen2vl_server.py +182 -0
  105. memos_ml_backends/schemas.py +48 -0
  106. pensiev-0.25.5.dist-info/LICENSE +201 -0
  107. pensiev-0.25.5.dist-info/METADATA +541 -0
  108. pensiev-0.25.5.dist-info/RECORD +111 -0
  109. pensiev-0.25.5.dist-info/WHEEL +5 -0
  110. pensiev-0.25.5.dist-info/entry_points.txt +2 -0
  111. pensiev-0.25.5.dist-info/top_level.txt +2 -0
memos/cmds/library.py ADDED
@@ -0,0 +1,1289 @@
+ # Standard library imports
+ import time
+ import math
+ import re
+ import os
+ import threading
+ import asyncio
+ import logging
+ import logging.config
+ from pathlib import Path
+ from datetime import datetime
+ from enum import Enum
+ from typing import List, Tuple, Dict, Any, Optional, Set
+ from functools import lru_cache
+ from collections import defaultdict, deque
+ from concurrent.futures import ThreadPoolExecutor
+
+ # Third-party imports
+ import typer
+ import httpx
+ from tqdm import tqdm
+ from tabulate import tabulate
+ import psutil
+ from watchdog.observers import Observer
+ from watchdog.events import FileSystemEventHandler
+
+ # Local imports
+ from memos.config import settings
+ from memos.utils import get_image_metadata
+ from memos.schemas import MetadataSource
+ from memos.logging_config import LOGGING_CONFIG
+
+
+ logging.config.dictConfig(LOGGING_CONFIG)
+ logger = logging.getLogger(__name__)
+
+ lib_app = typer.Typer()
+
+ file_detector = None
+
+ IS_THUMBNAIL = "is_thumbnail"
+
+ BASE_URL = settings.server_endpoint
+
+ include_files = [".jpg", ".jpeg", ".png", ".webp"]
+
+
+ class FileStatus(Enum):
+     UPDATED = "updated"
+     ADDED = "added"
+
+
+ def format_timestamp(timestamp):
+     if isinstance(timestamp, str):
+         return timestamp
+     return datetime.fromtimestamp(timestamp).replace(tzinfo=None).isoformat()
+
+
+ def init_file_detector():
+     """Initialize the global file detector if not already initialized"""
+     global file_detector
+     if file_detector is None:
+         from magika import Magika
+
+         file_detector = Magika()
+     return file_detector
+
+
+ def get_file_type(file_path):
+     """Get file type using lazy-loaded detector"""
+     detector = init_file_detector()
+     file_result = detector.identify_path(file_path)
+     return file_result.output.ct_label, file_result.output.group
+
+
+ def display_libraries(libraries):
+     table = []
+     for library in libraries:
+         table.append(
+             [
+                 library["id"],
+                 library["name"],
+                 "\n".join(
+                     f"{folder['id']}: {folder['path']}" for folder in library["folders"]
+                 ),
+                 "\n".join(
+                     f"{plugin['id']}: {plugin['name']} {plugin['webhook_url']}"
+                     for plugin in library["plugins"]
+                 ),
+             ]
+         )
+
+     print(
+         tabulate(table, headers=["ID", "Name", "Folders", "Plugins"], tablefmt="plain")
+     )
+
+
+ @lib_app.command("ls")
+ def ls():
+     response = httpx.get(f"{BASE_URL}/libraries")
+     libraries = response.json()
+     display_libraries(libraries)
+
+
+ @lib_app.command("create")
+ def add(name: str, folders: List[str]):
+     absolute_folders = []
+     for folder in folders:
+         folder_path = Path(folder).resolve()
+         absolute_folders.append(
+             {
+                 "path": str(folder_path),
+                 "last_modified_at": datetime.fromtimestamp(
+                     folder_path.stat().st_mtime
+                 ).isoformat(),
+             }
+         )
+
+     response = httpx.post(
+         f"{BASE_URL}/libraries", json={"name": name, "folders": absolute_folders}
+     )
+     if 200 <= response.status_code < 300:
+         print("Library created successfully")
+     else:
+         print(f"Failed to create library: {response.status_code} - {response.text}")
+
+
+ @lib_app.command("add-folder")
+ def add_folder(library_id: int, folders: List[str]):
+     absolute_folders = []
+     for folder in folders:
+         folder_path = Path(folder).resolve()
+         absolute_folders.append(
+             {
+                 "path": str(folder_path),
+                 "last_modified_at": datetime.fromtimestamp(
+                     folder_path.stat().st_mtime
+                 ).isoformat(),
+             }
+         )
+
+     response = httpx.post(
+         f"{BASE_URL}/libraries/{library_id}/folders",
+         json={"folders": absolute_folders},
+     )
+     if 200 <= response.status_code < 300:
+         print("Folders added successfully")
+         library = response.json()
+         display_libraries([library])
+     else:
+         print(f"Failed to add folders: {response.status_code} - {response.text}")
+
+
+ @lib_app.command("show")
+ def show(library_id: int):
+     response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
+     if response.status_code == 200:
+         library = response.json()
+         display_libraries([library])
+     else:
+         print(f"Failed to retrieve library: {response.status_code} - {response.text}")
+
+
+ def is_temp_file(filename):
+     return (
+         filename.startswith(".")
+         or filename.startswith("tmp")
+         or filename.startswith("temp")
+     )
+
+
+ async def loop_files(library, folder, folder_path, force, plugins, batch_size):
+     """
+     Process files in the folder
+
+     Args:
+         library: Library object
+         folder: Folder information
+         folder_path: Folder path
+         force: Whether to force update
+         plugins: List of plugins
+         batch_size: Batch size
+
+     Returns:
+         Tuple[int, int, int]: (Number of files added, Number of files updated, Number of files deleted)
+     """
+     updated_file_count = 0
+     added_file_count = 0
+     deleted_file_count = 0
+     semaphore = asyncio.Semaphore(batch_size)
+
+     async with httpx.AsyncClient(timeout=60) as client:
+         # 1. Collect candidate files
+         candidate_files = await collect_candidate_files(folder_path)
+         scanned_files = set(candidate_files)
+
+         # 2. Process file batches
+         added_file_count, updated_file_count = await process_file_batches(
+             client,
+             library,
+             folder,
+             candidate_files,
+             force,
+             plugins,
+             semaphore,
+         )
+
+         # 3. Check for deleted files
+         deleted_file_count = await check_deleted_files(
+             client, library.get("id"), folder, folder_path, scanned_files
+         )
+
+     return added_file_count, updated_file_count, deleted_file_count
+
+
+ @lib_app.command("scan")
+ def scan(
+     library_id: int,
+     path: str = typer.Argument(None, help="Path to scan within the library"),
+     force: bool = False,
+     plugins: List[int] = typer.Option(None, "--plugin", "-p"),
+     folders: List[int] = typer.Option(None, "--folder", "-f"),
+     batch_size: int = typer.Option(
+         1, "--batch-size", "-bs", help="Batch size for processing files"
+     ),
+ ):
+     # Check if both path and folders are provided
+     if path and folders:
+         print("Error: You cannot specify both a path and folders at the same time.")
+         return
+
+     response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
+     if response.status_code != 200:
+         print(f"Failed to retrieve library: {response.status_code} - {response.text}")
+         return
+
+     library = response.json()
+     total_files_added = 0
+     total_files_updated = 0
+     total_files_deleted = 0
+
+     # Filter folders if the folders parameter is provided
+     if folders:
+         library_folders = [
+             folder for folder in library["folders"] if folder["id"] in folders
+         ]
+     else:
+         library_folders = library["folders"]
+
+     # Check if a specific path is provided
+     if path:
+         path = Path(path).expanduser().resolve()
+         # Check if the path is a folder or a subdirectory of a library folder
+         folder = next(
+             (
+                 folder
+                 for folder in library_folders
+                 if path.is_relative_to(Path(folder["path"]).resolve())
+             ),
+             None,
+         )
+         if not folder:
+             print(f"Error: The path {path} is not part of any folder in the library.")
+             return
+         # Only scan the specified path
+         library_folders = [{"id": folder["id"], "path": str(path)}]
+
+     for folder in library_folders:
+         folder_path = Path(folder["path"])
+         if not folder_path.exists() or not folder_path.is_dir():
+             tqdm.write(f"Folder does not exist or is not a directory: {folder_path}")
+             continue
+
+         added_file_count, updated_file_count, deleted_file_count = asyncio.run(
+             loop_files(library, folder, folder_path, force, plugins, batch_size)
+         )
+         total_files_added += added_file_count
+         total_files_updated += updated_file_count
+         total_files_deleted += deleted_file_count
+
+     print(f"Total files added: {total_files_added}")
+     print(f"Total files updated: {total_files_updated}")
+     print(f"Total files deleted: {total_files_deleted}")
+
+
+ async def add_entity(
+     client: httpx.AsyncClient,
+     semaphore: asyncio.Semaphore,
+     library_id,
+     plugins,
+     new_entity,
+ ) -> Tuple[str, FileStatus, bool, Optional[httpx.Response]]:
+     async with semaphore:
+         MAX_RETRIES = 3
+         RETRY_DELAY = 2.0
+         for attempt in range(MAX_RETRIES):
+             try:
+                 post_response = await client.post(
+                     f"{BASE_URL}/libraries/{library_id}/entities",
+                     json=new_entity,
+                     params=(
+                         {"plugins": plugins, "update_index": "true"}
+                         if plugins
+                         else {"update_index": "true"}
+                     ),
+                     timeout=60,
+                 )
+                 if 200 <= post_response.status_code < 300:
+                     return new_entity["filepath"], FileStatus.ADDED, True, post_response
+                 else:
+                     return (
+                         new_entity["filepath"],
+                         FileStatus.ADDED,
+                         False,
+                         post_response,
+                     )
+             except Exception as e:
+                 logger.error(
+                     f"Error while adding entity (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
+                 )
+                 if attempt < MAX_RETRIES - 1:
+                     await asyncio.sleep(RETRY_DELAY)
+                 else:
+                     return new_entity["filepath"], FileStatus.ADDED, False, None
+
+
+ async def update_entity(
+     client: httpx.AsyncClient,
+     semaphore: asyncio.Semaphore,
+     plugins,
+     new_entity,
+     existing_entity,
+ ) -> Tuple[str, FileStatus, bool, Optional[httpx.Response]]:
+     MAX_RETRIES = 3
+     RETRY_DELAY = 2.0
+     async with semaphore:
+         for attempt in range(MAX_RETRIES):
+             try:
+                 update_response = await client.put(
+                     f"{BASE_URL}/entities/{existing_entity['id']}",
+                     json=new_entity,
+                     params={
+                         "trigger_webhooks_flag": "true",
+                         "update_index": "true",
+                         **({"plugins": plugins} if plugins else {}),
+                     },
+                     timeout=60,
+                 )
+                 if 200 <= update_response.status_code < 300:
+                     return (
+                         new_entity["filepath"],
+                         FileStatus.UPDATED,
+                         True,
+                         update_response,
+                     )
+                 else:
+                     return (
+                         new_entity["filepath"],
+                         FileStatus.UPDATED,
+                         False,
+                         update_response,
+                     )
+             except Exception as e:
+                 logger.error(
+                     f"Error while updating entity {existing_entity['id']} (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
+                 )
+                 if attempt < MAX_RETRIES - 1:
+                     await asyncio.sleep(RETRY_DELAY)
+                 else:
+                     return new_entity["filepath"], FileStatus.UPDATED, False, None
+
+
+ @lib_app.command("reindex")
374
+ def reindex(
375
+ library_id: int,
376
+ folders: List[int] = typer.Option(None, "--folder", "-f"),
377
+ force: bool = typer.Option(
378
+ False, "--force", help="Force recreate FTS and vector tables before reindexing"
379
+ ),
380
+ batch_size: int = typer.Option(
381
+ 1, "--batch-size", "-bs", help="Batch size for processing entities"
382
+ ),
383
+ ):
384
+ print(f"Reindexing library {library_id}")
385
+
386
+ from memos.databases.initializers import recreate_fts_and_vec_tables
387
+
388
+ # Get the library
389
+ response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
390
+ if response.status_code != 200:
391
+ print(f"Failed to get library: {response.status_code} - {response.text}")
392
+ return
393
+
394
+ library = response.json()
395
+ scanned_entities = set()
396
+
397
+ # Filter folders if the folders parameter is provided
398
+ if folders:
399
+ library_folders = [
400
+ folder for folder in library["folders"] if folder["id"] in folders
401
+ ]
402
+ else:
403
+ library_folders = library["folders"]
404
+
405
+ if force:
406
+ print("Force flag is set. Recreating FTS and vector tables...")
407
+ if not recreate_fts_and_vec_tables(settings):
408
+ return
409
+ print("FTS and vector tables have been recreated.")
410
+
411
+ with httpx.Client() as client:
412
+ total_entities = 0
413
+
414
+ # Get total entity count for all folders
415
+ for folder in library_folders:
416
+ response = client.get(
417
+ f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
418
+ params={"limit": 1, "offset": 0},
419
+ )
420
+ if response.status_code == 200:
421
+ total_entities += int(response.headers.get("X-Total-Count", 0))
422
+ else:
423
+ print(
424
+ f"Failed to get entity count for folder {folder['id']}: {response.status_code} - {response.text}"
425
+ )
426
+
427
+ # Now process entities with a progress bar
428
+ with tqdm(total=total_entities, desc="Reindexing entities") as pbar:
429
+ for folder in library_folders:
430
+ print(f"Processing folder: {folder['id']}")
431
+
432
+ # List all entities in the folder
433
+ limit = 200
434
+ offset = 0
435
+ while True:
436
+ entities_response = client.get(
437
+ f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
438
+ params={"limit": limit, "offset": offset},
439
+ )
440
+ if entities_response.status_code != 200:
441
+ print(
442
+ f"Failed to get entities: {entities_response.status_code} - {entities_response.text}"
443
+ )
444
+ break
445
+
446
+ entities = entities_response.json()
447
+ if not entities:
448
+ break
449
+
450
+ # Collect entity IDs to be processed
451
+ entity_ids = [
452
+ entity["id"]
453
+ for entity in entities
454
+ if entity["id"] not in scanned_entities
455
+ ]
456
+
457
+ # Process in batches
458
+ for i in range(0, len(entity_ids), batch_size):
459
+ batch_ids = entity_ids[i : i + batch_size]
460
+ if batch_ids:
461
+ batch_response = client.post(
462
+ f"{BASE_URL}/entities/batch-index",
463
+ json={"entity_ids": batch_ids},
464
+ timeout=60,
465
+ )
466
+ if batch_response.status_code != 204:
467
+ print(
468
+ f"Failed to update batch: {batch_response.status_code} - {batch_response.text}"
469
+ )
470
+ pbar.update(len(batch_ids))
471
+ scanned_entities.update(batch_ids)
472
+
473
+ offset += limit
474
+
475
+ if folders:
476
+ print(f"Reindexing completed for library {library_id} with folders: {folders}")
477
+ else:
478
+ print(f"Reindexing completed for library {library_id}")
479
+
480
+
481
+ def has_entity_changes(new_entity: dict, existing_entity: dict) -> bool:
+     """
+     Compare new_entity with existing_entity to determine if there are actual changes.
+     Returns True if there are differences, False otherwise.
+     """
+     # Compare basic fields
+     basic_fields = [
+         "filename",
+         "filepath",
+         "size",
+         "file_created_at",
+         "file_last_modified_at",
+         "file_type",
+         "file_type_group",
+     ]
+
+     for field in basic_fields:
+         if new_entity.get(field) != existing_entity.get(field):
+             return True
+
+     # Compare metadata entries
+     new_metadata = {
+         (entry["key"], entry["value"])
+         for entry in new_entity.get("metadata_entries", [])
+     }
+     existing_metadata = {
+         (entry["key"], entry["value"])
+         for entry in existing_entity.get("metadata_entries", [])
+     }
+     if new_metadata != existing_metadata:
+         return True
+
+     # Compare tags
+     new_tags = set(new_entity.get("tags", []))
+     existing_tags = {tag["name"] for tag in existing_entity.get("tags", [])}
+     if new_tags != existing_tags:
+         return True
+
+     return False
+
+
+ @lib_app.command("sync")
523
+ def sync(
524
+ library_id: int,
525
+ filepath: str,
526
+ force: bool = typer.Option(
527
+ False, "--force", "-f", help="Force update the file even if it hasn't changed"
528
+ ),
529
+ without_webhooks: bool = typer.Option(
530
+ False, "--no-plugins", help="Disable plugin triggers", is_flag=True
531
+ ),
532
+ ):
533
+ """
534
+ Sync a specific file with the library.
535
+ """
536
+ # 1. Get library by id and check if it exists
537
+ response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
538
+ if response.status_code != 200:
539
+ typer.echo(f"Error: Library with id {library_id} not found.")
540
+ raise typer.Exit(code=1)
541
+
542
+ library = response.json()
543
+
544
+ # Convert filepath to absolute path
545
+ file_path = Path(filepath).resolve()
546
+
547
+ if not file_path.is_file():
548
+ typer.echo(f"Error: File {file_path} does not exist.")
549
+ raise typer.Exit(code=1)
550
+
551
+ # 2. Check if the file exists in the library
552
+ response = httpx.get(
553
+ f"{BASE_URL}/libraries/{library_id}/entities/by-filepath",
554
+ params={"filepath": str(file_path)},
555
+ )
556
+
557
+ file_stat = file_path.stat()
558
+ file_type, file_type_group = get_file_type(file_path)
559
+
560
+ new_entity = {
561
+ "filename": file_path.name,
562
+ "filepath": str(file_path),
563
+ "size": file_stat.st_size,
564
+ "file_created_at": format_timestamp(file_stat.st_ctime),
565
+ "file_last_modified_at": format_timestamp(file_stat.st_mtime),
566
+ "file_type": file_type,
567
+ "file_type_group": file_type_group,
568
+ }
569
+
570
+ # Handle metadata
571
+ is_thumbnail = False
572
+ if file_type_group == "image":
573
+ metadata = get_image_metadata(file_path)
574
+ if metadata:
575
+ if "active_window" in metadata and "active_app" not in metadata:
576
+ metadata["active_app"] = metadata["active_window"].split(" - ")[0]
577
+ new_entity["metadata_entries"] = [
578
+ {
579
+ "key": key,
580
+ "value": str(value),
581
+ "source": MetadataSource.SYSTEM_GENERATED.value,
582
+ "data_type": (
583
+ "number" if isinstance(value, (int, float)) else "text"
584
+ ),
585
+ }
586
+ for key, value in metadata.items()
587
+ if key != IS_THUMBNAIL
588
+ ]
589
+ if "active_app" in metadata:
590
+ new_entity.setdefault("tags", []).append(metadata["active_app"])
591
+ is_thumbnail = metadata.get(IS_THUMBNAIL, False)
592
+
593
+ if is_thumbnail:
594
+ typer.echo(f"Skipping thumbnail file: {file_path}")
595
+ return
596
+
597
+ if response.status_code == 200:
598
+ # File exists, update it
599
+ existing_entity = response.json()
600
+ new_entity["folder_id"] = existing_entity["folder_id"]
601
+
602
+ if is_thumbnail:
603
+ new_entity["file_created_at"] = existing_entity["file_created_at"]
604
+ new_entity["file_last_modified_at"] = existing_entity[
605
+ "file_last_modified_at"
606
+ ]
607
+ new_entity["file_type"] = existing_entity["file_type"]
608
+ new_entity["file_type_group"] = existing_entity["file_type_group"]
609
+ new_entity["size"] = existing_entity["size"]
610
+
611
+ if not force:
612
+ # Merge existing metadata with new metadata
613
+ new_metadata_keys = {
614
+ entry["key"] for entry in new_entity.get("metadata_entries", [])
615
+ }
616
+ for existing_entry in existing_entity.get("metadata_entries", []):
617
+ if existing_entry["key"] not in new_metadata_keys:
618
+ new_entity["metadata_entries"].append(existing_entry)
619
+
620
+ # Merge existing tags with new tags
621
+ existing_tags = {tag["name"] for tag in existing_entity.get("tags", [])}
622
+ new_tags = set(new_entity.get("tags", []))
623
+ merged_tags = new_tags.union(existing_tags)
624
+ new_entity["tags"] = list(merged_tags)
625
+
626
+ # Only update if there are actual changes or force flag is set
627
+ if force or has_entity_changes(new_entity, existing_entity):
628
+ update_response = httpx.put(
629
+ f"{BASE_URL}/entities/{existing_entity['id']}",
630
+ json=new_entity,
631
+ params={
632
+ "trigger_webhooks_flag": str(not without_webhooks).lower(),
633
+ "update_index": "true",
634
+ },
635
+ timeout=60,
636
+ )
637
+ if update_response.status_code == 200:
638
+ typer.echo(f"Updated file: {file_path}")
639
+ else:
640
+ typer.echo(
641
+ f"Error updating file: {update_response.status_code} - {update_response.text}"
642
+ )
643
+ else:
644
+ typer.echo(f"File {file_path} is up to date. No changes detected.")
645
+
646
+ else:
647
+ # 3. File doesn't exist, check if it belongs to a folder in the library
648
+ folder = next(
649
+ (
650
+ folder
651
+ for folder in library["folders"]
652
+ if str(file_path).startswith(folder["path"])
653
+ ),
654
+ None,
655
+ )
656
+
657
+ if folder:
658
+ # Create new entity
659
+ new_entity["folder_id"] = folder["id"]
660
+
661
+ create_response = httpx.post(
662
+ f"{BASE_URL}/libraries/{library_id}/entities",
663
+ json=new_entity,
664
+ params={
665
+ "trigger_webhooks_flag": str(not without_webhooks).lower(),
666
+ "update_index": "true",
667
+ },
668
+ timeout=60,
669
+ )
670
+
671
+ if create_response.status_code == 200:
672
+ typer.echo(f"Created new entity for file: {file_path}")
673
+ else:
674
+ typer.echo(
675
+ f"Error creating entity: {create_response.status_code} - {create_response.text}"
676
+ )
677
+
678
+ else:
679
+ # 4. File doesn't belong to any folder in the library
680
+ typer.echo(
681
+ f"Error: File {file_path} does not belong to any folder in the library."
682
+ )
683
+ raise typer.Exit(code=1)
684
+
685
+
686
+ @lru_cache(maxsize=1)
+ def is_on_battery():
+     try:
+         battery = psutil.sensors_battery()
+         return battery is not None and not battery.power_plugged
+     except Exception:
+         return False  # If unable to detect battery status, assume not on battery
+
+
+ # Debounce and throttle file system events before syncing them to the library
+ class LibraryFileHandler(FileSystemEventHandler):
+     def __init__(
+         self,
+         library_id,
+         include_files,
+         max_workers=2,
+         sparsity_factor=3,
+         rate_window_size=10,
+         processing_interval=12,
+     ):
+         self.library_id = library_id
+         self.include_files = include_files
+         self.inode_pattern = re.compile(r"\._.+")
+         self.pending_files = defaultdict(lambda: {"timestamp": 0, "last_size": 0})
+         self.buffer_time = 2
+         self.executor = ThreadPoolExecutor(max_workers=max_workers)
+         self.lock = threading.Lock()
+
+         self.processing_interval = processing_interval
+         self.sparsity_factor = sparsity_factor
+         self.rate_window_size = rate_window_size
+
+         self.file_change_intervals = deque(maxlen=rate_window_size)
+         self.file_processing_durations = deque(maxlen=rate_window_size)
+
+         self.file_count = 0
+         self.file_submitted = 0
+         self.file_synced = 0
+         self.file_skipped = 0
+         self.logger = logger
+
+         self.last_battery_check = 0
+         self.battery_check_interval = 60  # Check battery status every 60 seconds
+
+     def handle_event(self, event):
+         if not event.is_directory and self.is_valid_file(event.src_path):
+             current_time = time.time()
+             with self.lock:
+                 file_info = self.pending_files[event.src_path]
+
+                 if current_time - file_info["timestamp"] > self.buffer_time:
+                     file_info["timestamp"] = current_time
+                     self.file_change_intervals.append(current_time)
+
+                 file_info["last_size"] = os.path.getsize(event.src_path)
+
+             return True
+         return False
+
+     def process_pending_files(self):
+         current_time = time.time()
+         files_to_process_with_plugins = []
+         files_to_process_without_plugins = []
+         processed_in_current_loop = 0
+         with self.lock:
+             for path, file_info in list(self.pending_files.items()):
+                 if current_time - file_info["timestamp"] > self.buffer_time:
+                     processed_in_current_loop += 1
+                     if os.path.exists(path) and os.path.getsize(path) > 0:
+                         self.file_count += 1
+                         if self.file_count % self.processing_interval == 0:
+                             files_to_process_with_plugins.append(path)
+                             self.logger.debug(
+                                 f"file_count % processing_interval: {self.file_count} % {self.processing_interval} == 0"
+                             )
+                             self.logger.debug(
+                                 f"Picked file for processing with plugins: {path}"
+                             )
+                         else:
+                             files_to_process_without_plugins.append(path)
+                             self.file_skipped += 1
+                         del self.pending_files[path]
+                     elif not os.path.exists(path):
+                         del self.pending_files[path]
+
+         # Process files with plugins - these count as submitted
+         for path in files_to_process_with_plugins:
+             self.executor.submit(self.process_file, path, False)
+             self.file_submitted += 1
+
+         # Process files without plugins - these don't count as submitted
+         for path in files_to_process_without_plugins:
+             self.executor.submit(self.process_file, path, True)
+
+         if processed_in_current_loop > 0:
+             self.logger.info(
+                 f"File count: {self.file_count}, Files submitted: {self.file_submitted}, Files synced: {self.file_synced}, Files skipped: {self.file_skipped}"
+             )
+
+         self.update_processing_interval()
+
+     def process_file(self, path, no_plugins):
+         self.logger.debug(f"Processing file: {path} (with plugins: {not no_plugins})")
+         start_time = time.time()
+         sync(self.library_id, path, without_webhooks=no_plugins)
+         end_time = time.time()
+         if not no_plugins:
+             with self.lock:
+                 self.file_processing_durations.append(end_time - start_time)
+                 self.file_synced += 1
+
+     def update_processing_interval(self):
+         min_samples = max(3, self.rate_window_size // 3)
+         max_interval = 60  # Maximum allowed interval between events in seconds
+
+         if (
+             len(self.file_change_intervals) >= min_samples
+             and len(self.file_processing_durations) >= min_samples
+         ):
+             # Filter out large time gaps
+             filtered_intervals = [
+                 self.file_change_intervals[i] - self.file_change_intervals[i - 1]
+                 for i in range(1, len(self.file_change_intervals))
+                 if self.file_change_intervals[i] - self.file_change_intervals[i - 1]
+                 <= max_interval
+             ]
+
+             if filtered_intervals:
+                 avg_change_interval = sum(filtered_intervals) / len(filtered_intervals)
+                 changes_per_second = (
+                     1 / avg_change_interval if avg_change_interval > 0 else 0
+                 )
+             else:
+                 changes_per_second = 0
+
+             total_processing_time = sum(self.file_processing_durations)
+             processing_per_second = (
+                 len(self.file_processing_durations) / total_processing_time
+                 if total_processing_time > 0
+                 else 0
+             )
+
+             if changes_per_second > 0 and processing_per_second > 0:
+                 rate = changes_per_second / processing_per_second
+                 new_processing_interval = max(1, math.ceil(self.sparsity_factor * rate))
+
+                 current_time = time.time()
+                 if current_time - self.last_battery_check > self.battery_check_interval:
+                     self.last_battery_check = current_time
+                     is_on_battery.cache_clear()  # Clear the cache to get fresh battery status
+                 if is_on_battery():
+                     new_processing_interval *= 2
+                     self.logger.info(
+                         "Running on battery, doubling the processing interval."
+                     )
+
+                 if new_processing_interval != self.processing_interval:
+                     old_processing_interval = self.processing_interval
+                     self.processing_interval = new_processing_interval
+                     self.logger.info(
+                         f"Processing interval: {old_processing_interval} -> {self.processing_interval}, Changes: {changes_per_second:.2f}it/s, Processing: {processing_per_second:.2f}it/s, Rate (changes/processing): {rate:.2f}"
+                     )
+
+     def is_valid_file(self, path):
+         filename = os.path.basename(path)
+         return (
+             any(path.lower().endswith(ext) for ext in self.include_files)
+             and not is_temp_file(filename)
+             and not self.inode_pattern.match(filename)
+         )
+
+     def on_created(self, event):
+         self.handle_event(event)
+
+     def on_modified(self, event):
+         self.handle_event(event)
+
+     def on_moved(self, event):
+         if self.handle_event(event):
+             # For moved events, we need to update the key in pending_files
+             with self.lock:
+                 self.pending_files[event.dest_path] = self.pending_files.pop(
+                     event.src_path, {"timestamp": time.time(), "last_size": 0}
+                 )
+
+     def on_deleted(self, event):
+         if self.is_valid_file(event.src_path):
+             self.logger.info(f"File deleted: {event.src_path}")
+             # Remove from pending files if it was there
+             with self.lock:
+                 self.pending_files.pop(event.src_path, None)
+             # Add logic for handling deleted files if needed
+
+
+ @lib_app.command("watch")
879
+ def watch(
880
+ library_id: int,
881
+ folders: List[int] = typer.Option(
882
+ None, "--folder", "-f", help="Specify folders to watch"
883
+ ),
884
+ sparsity_factor: float = typer.Option(
885
+ 3.0, "--sparsity-factor", "-sf", help="Sparsity factor for file processing"
886
+ ),
887
+ processing_interval: int = typer.Option(
888
+ 12,
889
+ "--processing-interval",
890
+ "-pi",
891
+ help="Process one file with plugins for every N files (higher means less frequent processing)",
892
+ ),
893
+ rate_window_size: int = typer.Option(
894
+ 10,
895
+ "--rate-window",
896
+ "-rw",
897
+ help="Number of recent events to consider when calculating processing rates",
898
+ ),
899
+ verbose: bool = typer.Option(
900
+ False, "--verbose", "-v", help="Enable verbose logging"
901
+ ),
902
+ ):
903
+ """
904
+ Watch for file changes in the library folders and sync automatically.
905
+ """
906
+ # Set the logging level based on the verbose flag
907
+ log_level = "DEBUG" if verbose else "INFO"
908
+ logger.setLevel(log_level)
909
+
910
+ logger.info(f"Watching library {library_id} for changes...")
911
+
912
+ # Get the library
913
+ response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
914
+ if response.status_code != 200:
915
+ print(f"Error: Library with id {library_id} not found.")
916
+ raise typer.Exit(code=1)
917
+
918
+ library = response.json()
919
+
920
+ # Filter folders if the folders parameter is provided
921
+ if folders:
922
+ library_folders = [
923
+ folder for folder in library["folders"] if folder["id"] in folders
924
+ ]
925
+ else:
926
+ library_folders = library["folders"]
927
+
928
+ if not library_folders:
929
+ print("No folders to watch.")
930
+ return
931
+
932
+ # Create an observer and handler for each folder in the library
933
+ observer = Observer()
934
+ handlers = []
935
+ for folder in library_folders:
936
+ folder_path = Path(folder["path"])
937
+ event_handler = LibraryFileHandler(
938
+ library_id,
939
+ include_files,
940
+ sparsity_factor=sparsity_factor,
941
+ processing_interval=processing_interval,
942
+ rate_window_size=rate_window_size,
943
+ )
944
+ handlers.append(event_handler)
945
+ observer.schedule(event_handler, str(folder_path), recursive=True)
946
+ print(f"Watching folder: {folder_path}")
947
+
948
+ observer.start()
949
+ try:
950
+ while True:
951
+ time.sleep(5)
952
+ for handler in handlers:
953
+ handler.process_pending_files()
954
+ except KeyboardInterrupt:
955
+ observer.stop()
956
+ for handler in handlers:
957
+ handler.executor.shutdown(wait=True)
958
+ observer.join()
959
+
960
+
961
+ async def collect_candidate_files(folder_path: Path) -> List[str]:
+     """
+     Collect candidate files to be processed
+
+     Args:
+         folder_path: Folder path
+
+     Returns:
+         List[str]: List of candidate file paths
+     """
+     candidate_files = []
+     for root, _, files in os.walk(folder_path):
+         with tqdm(total=len(files), desc=f"Scanning {root}", leave=True) as pbar:
+             for file in files:
+                 file_path = Path(root) / file
+                 absolute_file_path = file_path.resolve()
+
+                 # Check if the file extension is in the include_files list and is not a temporary file
+                 if file_path.suffix.lower() in include_files and not is_temp_file(file):
+                     candidate_files.append(str(absolute_file_path))
+                 pbar.update(1)
+
+     return candidate_files
+
+
+ async def prepare_entity(file_path: str, folder_id: int) -> Dict[str, Any]:
+     """
+     Prepare entity data
+
+     Args:
+         file_path: File path
+         folder_id: Folder ID
+
+     Returns:
+         Dict[str, Any]: Entity data
+     """
+     file_path = Path(file_path)
+     file_stat = file_path.stat()
+     file_type, file_type_group = get_file_type(file_path)
+
+     new_entity = {
+         "filename": file_path.name,
+         "filepath": str(file_path),
+         "size": file_stat.st_size,
+         "file_created_at": format_timestamp(file_stat.st_ctime),
+         "file_last_modified_at": format_timestamp(file_stat.st_mtime),
+         "file_type": file_type,
+         "file_type_group": file_type_group,
+         "folder_id": folder_id,
+     }
+
+     # Handle image metadata
+     is_thumbnail = False
+     if file_type_group == "image":
+         metadata = get_image_metadata(file_path)
+         if metadata:
+             if "active_window" in metadata and "active_app" not in metadata:
+                 metadata["active_app"] = metadata["active_window"].split(" - ")[0]
+             new_entity["metadata_entries"] = [
+                 {
+                     "key": key,
+                     "value": str(value),
+                     "source": MetadataSource.SYSTEM_GENERATED.value,
+                     "data_type": (
+                         "number" if isinstance(value, (int, float)) else "text"
+                     ),
+                 }
+                 for key, value in metadata.items()
+                 if key != IS_THUMBNAIL
+             ]
+             if "active_app" in metadata:
+                 new_entity.setdefault("tags", []).append(metadata["active_app"])
+             is_thumbnail = metadata.get(IS_THUMBNAIL, False)
+
+     new_entity["is_thumbnail"] = is_thumbnail
+     return new_entity
+
+
+ def format_error_message(
+     file_status: FileStatus, response: Optional[httpx.Response]
+ ) -> str:
+     """
+     Format error message
+
+     Args:
+         file_status: File status
+         response: HTTP response
+
+     Returns:
+         str: Formatted error message
+     """
+     action = "add" if file_status == FileStatus.ADDED else "update"
+     error_message = f"Failed to {action} file"
+
+     if response:
+         if hasattr(response, "status_code"):
+             error_message += f": {response.status_code}"
+         if hasattr(response, "text"):
+             error_message += f" - {response.text}"
+     else:
+         error_message += " - Unknown error occurred"
+
+     return error_message
+
+
+ async def process_file_batches(
+     client: httpx.AsyncClient,
+     library: dict,
+     folder: dict,
+     candidate_files: list,
+     force: bool,
+     plugins: list,
+     semaphore: asyncio.Semaphore,
+ ) -> Tuple[int, int]:
+     """
+     Process file batches
+
+     Args:
+         client: httpx async client
+         library: Library object
+         folder: Folder information
+         candidate_files: List of candidate files
+         force: Whether to force update
+         plugins: List of plugins
+         semaphore: Concurrency control semaphore
+
+     Returns:
+         Tuple[int, int]: (Number of files added, Number of files updated)
+     """
+     added_file_count = 0
+     updated_file_count = 0
+     batching = 50
+
+     library_id = library.get("id")
+     library_plugins = [plugin.get("id") for plugin in library.get("plugins", [])]
+     target_plugins = (
+         library_plugins
+         if plugins is None
+         else [plugin for plugin in library_plugins if plugin in plugins]
+     )
+
+     with tqdm(total=len(candidate_files), desc="Processing files", leave=True) as pbar:
+         for i in range(0, len(candidate_files), batching):
+             batch = candidate_files[i : i + batching]
+
+             # Get existing entities in the batch
+             get_response = await client.post(
+                 f"{BASE_URL}/libraries/{library_id}/entities/by-filepaths",
+                 json=batch,
+             )
+
+             if get_response.status_code != 200:
+                 print(
+                     f"Failed to get entities: {get_response.status_code} - {get_response.text}"
+                 )
+                 pbar.update(len(batch))
+                 continue
+
+             existing_entities = get_response.json()
+             existing_entities_dict = {
+                 entity["filepath"]: entity for entity in existing_entities
+             }
+
+             # Process each file
+             tasks = []
+             for file_path in batch:
+                 new_entity = await prepare_entity(file_path, folder["id"])
+
+                 if new_entity.get("is_thumbnail", False):
+                     typer.echo(f"Skipping thumbnail file: {file_path}")
+                     continue
+
+                 existing_entity = existing_entities_dict.get(str(file_path))
+                 if existing_entity:
+                     if force:
+                         # Directly update without merging if force is true
+                         tasks.append(
+                             update_entity(
+                                 client, semaphore, plugins, new_entity, existing_entity
+                             )
+                         )
+                     else:
+                         # Merge existing metadata with new metadata
+                         new_metadata_keys = {
+                             entry["key"]
+                             for entry in new_entity.get("metadata_entries", [])
+                         }
+                         for existing_entry in existing_entity.get(
+                             "metadata_entries", []
+                         ):
+                             if existing_entry["key"] not in new_metadata_keys:
+                                 new_entity.setdefault("metadata_entries", []).append(
+                                     existing_entry
+                                 )
+
+                         # Merge existing tags with new tags
+                         existing_tags = {
+                             tag["name"] for tag in existing_entity.get("tags", [])
+                         }
+                         new_tags = set(new_entity.get("tags", []))
+                         merged_tags = new_tags.union(existing_tags)
+                         new_entity["tags"] = list(merged_tags)
+
+                         # Check if the entity needs to be processed by any plugins
+                         processed_plugins = {
+                             plugin_status.get("plugin_id")
+                             for plugin_status in existing_entity.get("plugin_status", [])
+                         }
+                         has_unprocessed_plugins = any(
+                             plugin_id not in processed_plugins
+                             for plugin_id in target_plugins
+                         )
+
+                         # Only update if there are actual changes or the entity needs to be processed by any plugins
+                         if has_unprocessed_plugins or has_entity_changes(
+                             new_entity, existing_entity
+                         ):
+                             tasks.append(
+                                 update_entity(
+                                     client, semaphore, plugins, new_entity, existing_entity
+                                 )
+                             )
+                         else:
+                             pbar.write(
+                                 f"Skipping file: {file_path} #{existing_entity.get('id')}"
+                             )
+                             pbar.update(1)
+                             continue
+                 else:
+                     tasks.append(
+                         add_entity(client, semaphore, library_id, plugins, new_entity)
+                     )
+
+             # Process task results
+             if tasks:
+                 for future in asyncio.as_completed(tasks):
+                     file_path, file_status, succeeded, response = await future
+                     if succeeded:
+                         if file_status == FileStatus.ADDED:
+                             added_file_count += 1
+                             tqdm.write(f"Added file to library: {file_path}")
+                         else:
+                             updated_file_count += 1
+                             tqdm.write(f"Updated file in library: {file_path}")
+                     else:
+                         error_message = format_error_message(file_status, response)
+                         tqdm.write(error_message)
+
+                     # Update progress bar for each file processed
+                     pbar.update(1)
+                     pbar.set_postfix(
+                         {"Added": added_file_count, "Updated": updated_file_count},
+                         refresh=True,
+                     )
+
+     return added_file_count, updated_file_count
+
+
+ async def check_deleted_files(
+     client: httpx.AsyncClient,
+     library_id: int,
+     folder: dict,
+     folder_path: Path,
+     scanned_files: Set[str],
+ ) -> int:
+     """
+     Check and handle deleted files
+
+     Args:
+         client: httpx async client
+         library_id: Library ID
+         folder: Folder information
+         folder_path: Folder path
+         scanned_files: Set of scanned files
+
+     Returns:
+         int: Number of deleted files
+     """
+     deleted_count = 0
+     limit = 100
+     offset = 0
+     total_entities = 0
+
+     with tqdm(
+         total=total_entities, desc="Checking for deleted files", leave=True
+     ) as pbar:
+         while True:
+             # Add path_prefix parameter to only get entities under the folder_path
+             existing_files_response = await client.get(
+                 f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
+                 params={
+                     "limit": limit,
+                     "offset": offset,
+                     "path_prefix": str(folder_path),
+                 },
+                 timeout=60,
+             )
+
+             if existing_files_response.status_code != 200:
+                 pbar.write(
+                     f"Failed to retrieve existing files: {existing_files_response.status_code} - {existing_files_response.text}"
+                 )
+                 break
+
+             existing_files = existing_files_response.json()
+             if not existing_files:
+                 break
+
+             # Update total count (if this is the first request)
+             if offset == 0:
+                 total_entities = int(
+                     existing_files_response.headers.get("X-Total-Count", total_entities)
+                 )
+                 pbar.total = total_entities
+                 pbar.refresh()
+
+             for existing_file in existing_files:
+                 if (
+                     # path_prefix may match files outside folder_path, for example
+                     # when folder_path is 20241101 but another folder is named
+                     # 20241101-copy, so checking that existing_file is relative to
+                     # folder_path is required; do not remove this.
+                     Path(existing_file["filepath"]).is_relative_to(folder_path)
+                     and existing_file["filepath"] not in scanned_files
+                 ):
+                     # File has been deleted
+                     delete_response = await client.delete(
+                         f"{BASE_URL}/libraries/{library_id}/entities/{existing_file['id']}"
+                     )
+                     if 200 <= delete_response.status_code < 300:
+                         pbar.write(
+                             f"Deleted file from library: {existing_file['filepath']}"
+                         )
+                         deleted_count += 1
+                     else:
+                         pbar.write(
+                             f"Failed to delete file: {delete_response.status_code} - {delete_response.text}"
+                         )
+                 pbar.update(1)
+
+             offset += limit
+
+     return deleted_count
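
Editorial note (not part of the wheel): the adaptive throttle in LibraryFileHandler.update_processing_interval boils down to one formula, interval = max(1, ceil(sparsity_factor * arrival_rate / processing_rate)). A small worked example of that arithmetic, with invented sample numbers:

import math

# Hypothetical load: a new screenshot lands every 4 s, and a full
# sync with plugins takes about 10 s per file.
changes_per_second = 1 / 4      # 0.25 files/s arriving
processing_per_second = 1 / 10  # 0.10 files/s fully processed
sparsity_factor = 3.0           # default used by the watch command

rate = changes_per_second / processing_per_second  # 2.5: arrivals outpace processing
new_processing_interval = max(1, math.ceil(sparsity_factor * rate))
print(new_processing_interval)      # 8 -> every 8th file gets the full plugin pass
print(new_processing_interval * 2)  # 16 -> the handler doubles this on battery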
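
For completeness, a minimal usage sketch of the commands defined above, driven through typer's test runner. It assumes a memos server is already listening at settings.server_endpoint; the library id and option values are made up for illustration:

from typer.testing import CliRunner

from memos.cmds.library import lib_app

runner = CliRunner()

# List libraries, then scan library 1 with up to four concurrent requests
print(runner.invoke(lib_app, ["ls"]).stdout)
print(runner.invoke(lib_app, ["scan", "1", "--batch-size", "4"]).stdout)

# Watch library 1, running plugins on one file out of every 12 (adapts over time)
runner.invoke(lib_app, ["watch", "1", "--processing-interval", "12"])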