pensiev-0.25.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. memos/__init__.py +6 -0
  2. memos/cmds/__init__.py +0 -0
  3. memos/cmds/library.py +1289 -0
  4. memos/cmds/plugin.py +96 -0
  5. memos/commands.py +865 -0
  6. memos/config.py +225 -0
  7. memos/crud.py +605 -0
  8. memos/databases/__init__.py +0 -0
  9. memos/databases/initializers.py +481 -0
  10. memos/dataset_extractor_for_florence.py +165 -0
  11. memos/dataset_extractor_for_internvl2.py +192 -0
  12. memos/default_config.yaml +88 -0
  13. memos/embedding.py +129 -0
  14. memos/frame_extractor.py +53 -0
  15. memos/logging_config.py +35 -0
  16. memos/main.py +104 -0
  17. memos/migrations/alembic/README +1 -0
  18. memos/migrations/alembic/__pycache__/env.cpython-310.pyc +0 -0
  19. memos/migrations/alembic/env.py +108 -0
  20. memos/migrations/alembic/script.py.mako +30 -0
  21. memos/migrations/alembic/versions/00904ac8c6fc_add_indexes_to_entitymodel.py +63 -0
  22. memos/migrations/alembic/versions/04acdaf75664_add_indices_to_entitytags_and_metadata.py +86 -0
  23. memos/migrations/alembic/versions/12504c5b1d3c_add_extra_columns_for_embedding.py +67 -0
  24. memos/migrations/alembic/versions/31a1ad0e10b3_add_entity_plugin_status.py +71 -0
  25. memos/migrations/alembic/versions/__pycache__/00904ac8c6fc_add_indexes_to_entitymodel.cpython-310.pyc +0 -0
  26. memos/migrations/alembic/versions/__pycache__/04acdaf75664_add_indices_to_entitytags_and_metadata.cpython-310.pyc +0 -0
  27. memos/migrations/alembic/versions/__pycache__/12504c5b1d3c_add_extra_columns_for_embedding.cpython-310.pyc +0 -0
  28. memos/migrations/alembic/versions/__pycache__/20f5ecab014d_add_entity_plugin_status.cpython-310.pyc +0 -0
  29. memos/migrations/alembic/versions/__pycache__/31a1ad0e10b3_add_entity_plugin_status.cpython-310.pyc +0 -0
  30. memos/migrations/alembic/versions/__pycache__/4fcb062c5128_add_extra_columns_for_embedding.cpython-310.pyc +0 -0
  31. memos/migrations/alembic/versions/__pycache__/d10c55fbb7d2_add_index_for_entity_file_type_group_.cpython-310.pyc +0 -0
  32. memos/migrations/alembic/versions/__pycache__/f8f158182416_add_active_app_index.cpython-310.pyc +0 -0
  33. memos/migrations/alembic/versions/d10c55fbb7d2_add_index_for_entity_file_type_group_.py +44 -0
  34. memos/migrations/alembic/versions/f8f158182416_add_active_app_index.py +75 -0
  35. memos/migrations/alembic.ini +116 -0
  36. memos/migrations.py +19 -0
  37. memos/models.py +199 -0
  38. memos/plugins/__init__.py +0 -0
  39. memos/plugins/ocr/__init__.py +0 -0
  40. memos/plugins/ocr/main.py +251 -0
  41. memos/plugins/ocr/models/ch_PP-OCRv4_det_infer.onnx +0 -0
  42. memos/plugins/ocr/models/ch_PP-OCRv4_rec_infer.onnx +0 -0
  43. memos/plugins/ocr/models/ch_ppocr_mobile_v2.0_cls_train.onnx +0 -0
  44. memos/plugins/ocr/ppocr-gpu.yaml +43 -0
  45. memos/plugins/ocr/ppocr.yaml +44 -0
  46. memos/plugins/ocr/server.py +227 -0
  47. memos/plugins/ocr/temp_ppocr.yaml +42 -0
  48. memos/plugins/vlm/__init__.py +0 -0
  49. memos/plugins/vlm/main.py +251 -0
  50. memos/prepare_dataset.py +107 -0
  51. memos/process_webp.py +55 -0
  52. memos/read_metadata.py +32 -0
  53. memos/record.py +358 -0
  54. memos/schemas.py +289 -0
  55. memos/search.py +1198 -0
  56. memos/server.py +883 -0
  57. memos/shotsum.py +105 -0
  58. memos/shotsum_with_ocr.py +145 -0
  59. memos/simple_tokenizer/dict/README.md +31 -0
  60. memos/simple_tokenizer/dict/hmm_model.utf8 +34 -0
  61. memos/simple_tokenizer/dict/idf.utf8 +258826 -0
  62. memos/simple_tokenizer/dict/jieba.dict.utf8 +348982 -0
  63. memos/simple_tokenizer/dict/pos_dict/char_state_tab.utf8 +6653 -0
  64. memos/simple_tokenizer/dict/pos_dict/prob_emit.utf8 +166 -0
  65. memos/simple_tokenizer/dict/pos_dict/prob_start.utf8 +259 -0
  66. memos/simple_tokenizer/dict/pos_dict/prob_trans.utf8 +5222 -0
  67. memos/simple_tokenizer/dict/stop_words.utf8 +1534 -0
  68. memos/simple_tokenizer/dict/user.dict.utf8 +4 -0
  69. memos/simple_tokenizer/linux/libsimple.so +0 -0
  70. memos/simple_tokenizer/macos/libsimple.dylib +0 -0
  71. memos/simple_tokenizer/windows/simple.dll +0 -0
  72. memos/static/_app/immutable/assets/0.e250c031.css +1 -0
  73. memos/static/_app/immutable/assets/_layout.e7937cfe.css +1 -0
  74. memos/static/_app/immutable/chunks/index.5c08976b.js +1 -0
  75. memos/static/_app/immutable/chunks/index.60ee613b.js +4 -0
  76. memos/static/_app/immutable/chunks/runtime.a7926cf6.js +5 -0
  77. memos/static/_app/immutable/chunks/scheduler.5c1cff6e.js +1 -0
  78. memos/static/_app/immutable/chunks/singletons.583bdf4e.js +1 -0
  79. memos/static/_app/immutable/entry/app.666c1643.js +1 -0
  80. memos/static/_app/immutable/entry/start.aed5c701.js +3 -0
  81. memos/static/_app/immutable/nodes/0.5862ea38.js +7 -0
  82. memos/static/_app/immutable/nodes/1.35378a5e.js +1 -0
  83. memos/static/_app/immutable/nodes/2.1ccf9ea5.js +81 -0
  84. memos/static/_app/version.json +1 -0
  85. memos/static/app.html +36 -0
  86. memos/static/favicon.png +0 -0
  87. memos/static/logos/memos_logo_1024.png +0 -0
  88. memos/static/logos/memos_logo_1024@2x.png +0 -0
  89. memos/static/logos/memos_logo_128.png +0 -0
  90. memos/static/logos/memos_logo_128@2x.png +0 -0
  91. memos/static/logos/memos_logo_16.png +0 -0
  92. memos/static/logos/memos_logo_16@2x.png +0 -0
  93. memos/static/logos/memos_logo_256.png +0 -0
  94. memos/static/logos/memos_logo_256@2x.png +0 -0
  95. memos/static/logos/memos_logo_32.png +0 -0
  96. memos/static/logos/memos_logo_32@2x.png +0 -0
  97. memos/static/logos/memos_logo_512.png +0 -0
  98. memos/static/logos/memos_logo_512@2x.png +0 -0
  99. memos/static/logos/memos_logo_64.png +0 -0
  100. memos/static/logos/memos_logo_64@2x.png +0 -0
  101. memos/test_server.py +802 -0
  102. memos/utils.py +49 -0
  103. memos_ml_backends/florence2_server.py +176 -0
  104. memos_ml_backends/qwen2vl_server.py +182 -0
  105. memos_ml_backends/schemas.py +48 -0
  106. pensiev-0.25.5.dist-info/LICENSE +201 -0
  107. pensiev-0.25.5.dist-info/METADATA +541 -0
  108. pensiev-0.25.5.dist-info/RECORD +111 -0
  109. pensiev-0.25.5.dist-info/WHEEL +5 -0
  110. pensiev-0.25.5.dist-info/entry_points.txt +2 -0
  111. pensiev-0.25.5.dist-info/top_level.txt +2 -0
memos/databases/initializers.py
@@ -0,0 +1,481 @@
+ """Database initializer classes for different database backends."""
+
+ import sys
+ from pathlib import Path
+ from sqlalchemy import create_engine, event, text
+ from sqlalchemy.exc import OperationalError
+ from sqlalchemy.orm import sessionmaker
+ import sqlite_vec
+
+ from ..models import RawBase, PluginModel, LibraryModel, LibraryPluginModel
+
+
+ def setup_database(settings, **engine_kwargs):
+     """Set up and initialize the database.
+
+     Args:
+         settings: Application settings containing database configuration
+         **engine_kwargs: Additional keyword arguments to pass to create_engine
+
+     Returns:
+         tuple: (engine, initializer) where engine is the SQLAlchemy engine and
+             initializer is the appropriate DatabaseInitializer instance
+     """
+     engine, initializer = create_db_initializer(settings, **engine_kwargs)
+     initializer.init_database()
+     return engine, initializer
+
+
+ def init_database(settings):
+     """Initialize the database."""
+     engine, initializer = create_db_initializer(settings)
+     return initializer.init_database()
+
+
+ def recreate_fts_and_vec_tables(settings):
+     """Recreate the database-specific tables without repopulating data."""
+     engine, initializer = create_db_initializer(settings)
+     return initializer.recreate_index_tables()
+
+
+ def initialize_default_plugins(session):
+     """Initialize default plugins in the database."""
+     default_plugins = [
+         PluginModel(
+             name="builtin_vlm", description="VLM Plugin", webhook_url="/plugins/vlm"
+         ),
+         PluginModel(
+             name="builtin_ocr", description="OCR Plugin", webhook_url="/plugins/ocr"
+         ),
+     ]
+
+     for plugin in default_plugins:
+         existing_plugin = session.query(PluginModel).filter_by(name=plugin.name).first()
+         if not existing_plugin:
+             session.add(plugin)
+
+     session.commit()
+
+     return default_plugins
+
+
+ def init_default_libraries(session, default_plugins, settings):
+     """Initialize default libraries and associate them with plugins."""
+     default_libraries = [
+         LibraryModel(name=settings.default_library),
+     ]
+
+     for library in default_libraries:
+         existing_library = (
+             session.query(LibraryModel).filter_by(name=library.name).first()
+         )
+         if not existing_library:
+             session.add(library)
+
+     for plugin in default_plugins:
+         bind_response = session.query(PluginModel).filter_by(name=plugin.name).first()
+         if bind_response:
+             # Check if the LibraryPluginModel already exists
+             existing_library_plugin = (
+                 session.query(LibraryPluginModel)
+                 .filter_by(library_id=1, plugin_id=bind_response.id)
+                 .first()
+             )
+
+             if not existing_library_plugin:
+                 library_plugin = LibraryPluginModel(
+                     library_id=1, plugin_id=bind_response.id
+                 )  # Assuming library_id=1 for default libraries
+                 session.add(library_plugin)
+
+     session.commit()
+
+
+ def create_db_initializer(settings, **engine_kwargs):
+     """Create a database engine and initializer based on settings.
+
+     Args:
+         settings: Application settings containing database configuration
+         **engine_kwargs: Additional keyword arguments to pass to create_engine
+
+     Returns:
+         tuple: (engine, initializer) where engine is the SQLAlchemy engine and
+             initializer is the appropriate DatabaseInitializer instance
+     """
+     default_engine_kwargs = {
+         "pool_size": 10,
+         "max_overflow": 20,
+         "pool_timeout": 60,
+         "pool_recycle": 3600,
+     }
+
+     if settings.is_sqlite:
+         default_engine_kwargs["connect_args"] = {"timeout": 60}
+
+     # Override defaults with any provided kwargs
+     default_engine_kwargs.update(engine_kwargs)
+
+     engine = create_engine(
+         settings.database_url,
+         **default_engine_kwargs
+     )
+
+     # Create the appropriate initializer based on database type
+     if settings.is_sqlite:
+         initializer = SQLiteInitializer(engine, settings)
+     else:
+         print("Using PostgreSQL")
+         initializer = PostgreSQLInitializer(engine, settings)
+
+     return engine, initializer
+
+
+ class DatabaseInitializer:
+     """Base class for database initialization."""
+     def __init__(self, engine, settings):
+         self.engine = engine
+         self.settings = settings
+
+     def init_database(self) -> bool:
+         """Initialize the database with common tables and data."""
+         try:
+             # Create all tables defined in SQLAlchemy models
+             RawBase.metadata.create_all(self.engine)
+             print(f"Database initialized successfully at {self.settings.database_url}")
+
+             # Initialize database-specific features
+             self.init_specific_features()
+
+             # Initialize default data
+             Session = sessionmaker(bind=self.engine)
+             with Session() as session:
+                 default_plugins = initialize_default_plugins(session)
+                 init_default_libraries(session, default_plugins, self.settings)
+
+             return True
+         except OperationalError as e:
+             print(f"Error initializing database: {e}")
+             return False
+
+     def init_extensions(self):
+         """Initialize database extensions. To be implemented by subclasses."""
+         pass
+
+     def init_specific_features(self):
+         """Initialize database-specific features. To be implemented by subclasses."""
+         pass
+
+     def recreate_index_tables(self) -> bool:
+         """Recreate database-specific index tables. To be implemented by subclasses."""
+         pass
+
+
+ class SQLiteInitializer(DatabaseInitializer):
+     """SQLite-specific database initializer."""
+     def __init__(self, engine, settings):
+         super().__init__(engine, settings)
+         self.init_extensions()
+
+     def init_extensions(self):
+         """Initialize SQLite-specific extensions."""
+         event.listen(self.engine, "connect", self._load_sqlite_extensions)
+
+     def _load_sqlite_extensions(self, dbapi_conn, connection_record):
+         """Load SQLite extensions for full-text search and vector operations."""
+         try:
+             dbapi_conn.enable_load_extension(True)
+         except AttributeError as e:
+             print("Error: Current SQLite3 build doesn't support loading extensions.")
+             print("\nRecommended solutions:")
+             print("1. Install Python using Conda (recommended for both Windows and macOS):")
+             print("   conda create -n yourenv python")
+             print("   conda activate yourenv")
+             print("\n2. Or on macOS, you can use Homebrew:")
+             print("   brew install python")
+             print(f"\nDetailed error: {str(e)}")
+             raise
+
+         # load simple tokenizer
+         current_dir = Path(__file__).parent.parent.resolve()
+         if sys.platform.startswith("linux"):
+             lib_path = current_dir / "simple_tokenizer" / "linux" / "libsimple"
+         elif sys.platform == "win32":
+             lib_path = current_dir / "simple_tokenizer" / "windows" / "simple"
+         elif sys.platform == "darwin":
+             lib_path = current_dir / "simple_tokenizer" / "macos" / "libsimple"
+         else:
+             raise OSError(f"Unsupported operating system: {sys.platform}")
+
+         dbapi_conn.load_extension(str(lib_path))
+         dict_path = current_dir / "simple_tokenizer" / "dict"
+         dbapi_conn.execute(f"SELECT jieba_dict('{dict_path}')")
+
+         # load vector ext
+         sqlite_vec.load(dbapi_conn)
+
+         # Set WAL mode after loading extensions
+         dbapi_conn.execute("PRAGMA journal_mode=WAL")
+
+     def init_specific_features(self):
+         """Initialize SQLite-specific features like FTS and vector extensions."""
+         # Create FTS and Vec tables
+         with self.engine.connect() as conn:
+             conn.execute(
+                 text(
+                     """
+                     CREATE VIRTUAL TABLE IF NOT EXISTS entities_fts USING fts5(
+                         id, filepath, tags, metadata,
+                         tokenize = 'simple 0',
+                         prefix = '2 3 4'
+                     )
+                     """
+                 )
+             )
+
+             conn.execute(
+                 text(
+                     f"""
+                     CREATE VIRTUAL TABLE IF NOT EXISTS entities_vec_v2 USING vec0(
+                         embedding float[{self.settings.embedding.num_dim}] distance_metric=cosine,
+                         file_type_group text,
+                         created_at_timestamp integer,
+                         file_created_at_timestamp integer,
+                         file_created_at_date text partition key,
+                         app_name text,
+                         library_id integer
+                     )
+                     """
+                 )
+             )
+
+     def recreate_index_tables(self) -> bool:
+         """Recreate SQLite-specific index tables (FTS and vector tables)."""
+         Session = sessionmaker(bind=self.engine)
+
+         with Session() as session:
+             try:
+                 # Drop existing tables
+                 session.execute(text("DROP TABLE IF EXISTS entities_fts"))
+                 session.execute(text("DROP TABLE IF EXISTS entities_vec_v2"))
+
+                 # Recreate entities_fts table
+                 session.execute(
+                     text(
+                         """
+                         CREATE VIRTUAL TABLE entities_fts USING fts5(
+                             id, filepath, tags, metadata,
+                             tokenize = 'simple 0',
+                             prefix = '2 3 4'
+                         )
+                         """
+                     )
+                 )
+
+                 # Recreate entities_vec_v2 table
+                 session.execute(
+                     text(
+                         f"""
+                         CREATE VIRTUAL TABLE entities_vec_v2 USING vec0(
+                             embedding float[{self.settings.embedding.num_dim}] distance_metric=cosine,
+                             file_type_group text,
+                             created_at_timestamp integer,
+                             file_created_at_timestamp integer,
+                             file_created_at_date text partition key,
+                             app_name text,
+                             library_id integer
+                         )
+                         """
+                     )
+                 )
+
+                 session.commit()
+                 print("Successfully recreated entities_fts and entities_vec_v2 tables.")
+                 return True
+             except Exception as e:
+                 session.rollback()
+                 print(f"Error recreating tables: {e}")
+                 return False
+
+
+ class PostgreSQLInitializer(DatabaseInitializer):
+     """PostgreSQL-specific database initializer."""
+
+     def __init__(self, engine, settings):
+         super().__init__(engine, settings)
+         self.init_extensions()
+
+     def init_extensions(self):
+         """Initialize PostgreSQL-specific extensions."""
+         with self.engine.connect() as conn:
+             # Create extensions in a separate transaction
+             conn.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
+             conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
+             conn.commit()
+
+     def init_specific_features(self):
+         """Initialize PostgreSQL-specific features."""
+         with self.engine.connect() as conn:
+             # Create the tsvector column and index for full-text search
+             conn.execute(
+                 text(
+                     f"""
+                     -- Create a table to store the full-text search data
+                     CREATE TABLE IF NOT EXISTS entities_fts (
+                         id INTEGER PRIMARY KEY,
+                         filepath TEXT,
+                         tags TEXT,
+                         metadata TEXT,
+                         search_vector tsvector GENERATED ALWAYS AS (
+                             setweight(to_tsvector('simple', coalesce(filepath, '')), 'A') ||
+                             setweight(to_tsvector('simple', coalesce(tags, '')), 'B') ||
+                             setweight(to_tsvector('simple', coalesce(metadata, '')), 'C')
+                         ) STORED,
+                         -- Add raw text columns for prefix/substring search
+                         search_text TEXT GENERATED ALWAYS AS (
+                             coalesce(filepath, '') || ' ' ||
+                             coalesce(tags, '') || ' ' ||
+                             coalesce(metadata, '')
+                         ) STORED
+                     );
+
+                     -- Create a GIN index for fast full-text search
+                     CREATE INDEX IF NOT EXISTS idx_entities_fts_search_vector
+                     ON entities_fts USING gin(search_vector);
+
+                     -- Create trigram index for fuzzy matching on filepath and search_text
+                     CREATE INDEX IF NOT EXISTS idx_entities_fts_filepath_trgm
+                     ON entities_fts USING gin(filepath gin_trgm_ops);
+                     CREATE INDEX IF NOT EXISTS idx_entities_fts_search_text_trgm
+                     ON entities_fts USING gin(search_text gin_trgm_ops);
+                     """
+                 )
+             )
+             conn.commit()
+
+             # Create vector table and indexes in a separate transaction
+             conn.execute(
+                 text(
+                     f"""
+                     -- Create vector search table
+                     CREATE TABLE IF NOT EXISTS entities_vec_v2 (
+                         rowid INTEGER PRIMARY KEY,
+                         embedding vector({self.settings.embedding.num_dim}),
+                         file_type_group TEXT,
+                         created_at_timestamp INTEGER,
+                         file_created_at_timestamp INTEGER,
+                         file_created_at_date TEXT,
+                         app_name TEXT,
+                         library_id INTEGER
+                     );
+
+                     -- Create index for vector similarity search using HNSW
+                     CREATE INDEX IF NOT EXISTS idx_entities_vec_v2_embedding
+                     ON entities_vec_v2 USING hnsw (embedding vector_cosine_ops)
+                     WITH (m = 16, ef_construction = 64);
+
+                     -- Create indexes for filtering
+                     CREATE INDEX IF NOT EXISTS idx_entities_vec_v2_file_type_group
+                     ON entities_vec_v2(file_type_group);
+                     CREATE INDEX IF NOT EXISTS idx_entities_vec_v2_file_created_at_date
+                     ON entities_vec_v2(file_created_at_date);
+                     CREATE INDEX IF NOT EXISTS idx_entities_vec_v2_app_name
+                     ON entities_vec_v2(app_name);
+                     CREATE INDEX IF NOT EXISTS idx_entities_vec_v2_library_id
+                     ON entities_vec_v2(library_id);
+                     """
+                 )
+             )
+             conn.commit()
+
+     def recreate_index_tables(self) -> bool:
+         """Recreate PostgreSQL-specific index tables."""
+         Session = sessionmaker(bind=self.engine)
+
+         with Session() as session:
+             try:
+                 # Drop existing tables
+                 session.execute(text("DROP TABLE IF EXISTS entities_fts CASCADE"))
+                 session.execute(text("DROP TABLE IF EXISTS entities_vec_v2 CASCADE"))
+                 session.commit()
+
+                 # Ensure extensions are created
+                 session.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
+                 session.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
+                 session.commit()
+
+                 # Recreate entities_fts table with tsvector support
+                 session.execute(
+                     text(
+                         f"""
+                         CREATE TABLE entities_fts (
+                             id INTEGER PRIMARY KEY,
+                             filepath TEXT,
+                             tags TEXT,
+                             metadata TEXT,
+                             search_vector tsvector GENERATED ALWAYS AS (
+                                 setweight(to_tsvector('simple', coalesce(filepath, '')), 'A') ||
+                                 setweight(to_tsvector('simple', coalesce(tags, '')), 'B') ||
+                                 setweight(to_tsvector('simple', coalesce(metadata, '')), 'C')
+                             ) STORED,
+                             -- Add raw text columns for prefix/substring search
+                             search_text TEXT GENERATED ALWAYS AS (
+                                 coalesce(filepath, '') || ' ' ||
+                                 coalesce(tags, '') || ' ' ||
+                                 coalesce(metadata, '')
+                             ) STORED
+                         );
+
+                         -- Create a GIN index for fast full-text search
+                         CREATE INDEX idx_entities_fts_search_vector
+                         ON entities_fts USING gin(search_vector);
+
+                         -- Create trigram index for fuzzy matching on filepath and search_text
+                         CREATE INDEX idx_entities_fts_filepath_trgm
+                         ON entities_fts USING gin(filepath gin_trgm_ops);
+                         CREATE INDEX idx_entities_fts_search_text_trgm
+                         ON entities_fts USING gin(search_text gin_trgm_ops);
+                         """
+                     )
+                 )
+                 session.commit()
+
+                 # Create vector table and indexes in a separate transaction
+                 session.execute(
+                     text(
+                         f"""
+                         -- Create vector search table
+                         CREATE TABLE entities_vec_v2 (
+                             rowid INTEGER PRIMARY KEY,
+                             embedding vector({self.settings.embedding.num_dim}),
+                             file_type_group TEXT,
+                             created_at_timestamp INTEGER,
+                             file_created_at_timestamp INTEGER,
+                             file_created_at_date TEXT,
+                             app_name TEXT,
+                             library_id INTEGER
+                         );
+
+                         -- Create index for vector similarity search using HNSW
+                         CREATE INDEX idx_entities_vec_v2_embedding
+                         ON entities_vec_v2 USING hnsw (embedding vector_cosine_ops)
+                         WITH (m = 16, ef_construction = 64);
+
+                         -- Create indexes for filtering
+                         CREATE INDEX idx_entities_vec_v2_file_type_group
+                         ON entities_vec_v2(file_type_group);
+                         CREATE INDEX idx_entities_vec_v2_file_created_at_date
+                         ON entities_vec_v2(file_created_at_date);
+                         CREATE INDEX idx_entities_vec_v2_app_name
+                         ON entities_vec_v2(app_name);
+                         CREATE INDEX idx_entities_vec_v2_library_id
+                         ON entities_vec_v2(library_id);
+                         """
+                     )
+                 )
+                 session.commit()
+
+                 print("Successfully recreated entities_fts and entities_vec_v2 tables.")
+                 return True
+             except Exception as e:
+                 session.rollback()
+                 print(f"Error recreating tables: {e}")
+                 return False
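
For context, a minimal usage sketch of the initializer module above (not part of the package diff). The demo_settings object is a hypothetical stand-in that only mimics the attributes the module actually reads (database_url, is_sqlite, default_library, and embedding.num_dim); the real package presumably builds its settings object in memos/config.py.

from types import SimpleNamespace

from memos.databases.initializers import setup_database, recreate_fts_and_vec_tables

# Hypothetical stand-in for the package's settings object (see memos/config.py).
demo_settings = SimpleNamespace(
    database_url="sqlite:///memos_demo.db",  # any SQLAlchemy URL
    is_sqlite=True,                          # selects SQLiteInitializer above
    default_library="screenshots",           # name given to the default library row
    embedding=SimpleNamespace(num_dim=768),  # dimension used for entities_vec_v2
)

# Creates the ORM tables, registers the SQLite tokenizer/vector extensions,
# and seeds the default plugins and library.
engine, initializer = setup_database(demo_settings)

# The FTS and vector index tables can later be rebuilt without re-ingesting data:
# recreate_fts_and_vec_tables(demo_settings)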
memos/dataset_extractor_for_florence.py
@@ -0,0 +1,165 @@
+ import json
+ import argparse
+ from sqlalchemy.orm import sessionmaker
+ from memos.models import EntityModel, EntityMetadataModel
+ from memos.config import get_database_path
+ from sqlalchemy import create_engine
+ from tqdm import tqdm
+ from pathlib import Path
+ import argilla as rg
+ from PIL import Image
+ import io
+
+
+ def prepare_huggingface_dataset(output_file, batch_size=100, record_count=10000):
+     """Prepare a Hugging Face dataset and save it as JSONL."""
+     db_path = get_database_path()
+     engine = create_engine(f"sqlite:///{db_path}")
+     Session = sessionmaker(bind=engine)
+
+     with Session() as session, open(output_file, "w", encoding="utf-8") as f:
+         query = session.query(EntityModel)
+         total = query.count()
+
+         progress_bar = tqdm(
+             total=min(total, record_count), desc="Processing entities", unit="entity"
+         )
+         inserted_records = 0
+
+         for offset in range(0, total, batch_size):
+             batch = query.limit(batch_size).offset(offset).all()
+
+             for entity in batch:
+                 # Skip entities with "low_info" tag
+                 if any(tag.name == "low_info" for tag in entity.tags):
+                     progress_bar.update(1)
+                     continue
+
+                 metadata = {entry.key: entry.value for entry in entity.metadata_entries}
+
+                 answer = metadata.get("internvl-72b-result") or metadata.get(
+                     "internvl_result"
+                 )
+                 if not answer or not Path(entity.filepath).exists():
+                     progress_bar.update(1)
+                     continue
+
+                 record = {
+                     "id": entity.id,
+                     "image": entity.filepath,
+                     "question": "<MORE_DETAILED_CAPTION>",
+                     "answer": answer,
+                 }
+                 json.dump(record, f, ensure_ascii=False)
+                 f.write("\n")
+                 progress_bar.update(1)
+                 inserted_records += 1
+
+                 if inserted_records >= record_count:
+                     break
+             if inserted_records >= record_count:
+                 break
+
+         progress_bar.close()
+
+     print(f"Dataset saved to {output_file}")
+
+
+ def init_argilla_dataset(client, dataset_name="image_captioning"):
+     workspace_name = "argilla"
+
+     workspace = client.workspaces(workspace_name)
+
+     if workspace is None:
+         workspace = rg.Workspace(name=workspace_name, client=client)
+         workspace.create()
+         print(f"Workspace created: {workspace_name}")
+
+     dataset = client.datasets(name=dataset_name)
+
+     if dataset is not None:
+         return dataset
+
+     settings = rg.Settings(
+         fields=[
+             rg.ImageField(name="image"),
+             rg.TextField(name="filepath")
+         ],
+         questions=[
+             rg.TextQuestion(
+                 name="text",
+                 title="Description of the image",
+                 required=True,
+                 use_markdown=True,
+             )
+         ],
+     )
+
+     dataset = rg.Dataset(
+         name=dataset_name, workspace=workspace_name, settings=settings, client=client
+     )
+
+     dataset.create()
+     print(f"Dataset created: {dataset_name}")
+
+     return dataset
+
+
+ def upload_to_argilla(input_file, batch_size=10, dataset_name="image_captioning"):
+     """Upload a JSONL dataset to Argilla."""
+
+     client = rg.Argilla(api_url="http://localhost:6900", api_key="argilla.apikey")
+
+     dataset = init_argilla_dataset(client, dataset_name)
+
+     records = []
+     total_records = sum(1 for _ in open(input_file, "r"))
+
+     with open(input_file, "r", encoding="utf-8") as f:
+         progress_bar = tqdm(
+             total=total_records, desc="Uploading to Argilla", unit="record"
+         )
+
+         for line in f:
+             record_data = json.loads(line)
+             image = Image.open(record_data["image"]).convert("RGB")
+             image.thumbnail((1280, 1280))
+
+             rg_record = rg.Record(
+                 id=str(record_data["id"]),
+                 fields={
+                     "image": image,
+                     "filepath": record_data["image"],
+                 },
+                 suggestions=[
+                     rg.Suggestion(
+                         "text", record_data["answer"], score=1.0, agent="internvl2"
+                     )
+                 ],
+             )
+             records.append(rg_record)
+
+             if len(records) >= batch_size:
+                 dataset.records.log(records)
+                 progress_bar.update(batch_size)
+                 records = []
+
+         if records:
+             dataset.records.log(records)
+             progress_bar.update(len(records))
+
+         progress_bar.close()
+
+     print(f"Dataset uploaded to Argilla: {dataset_name}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Prepare and upload dataset")
+     parser.add_argument("--output_file", default="dataset.jsonl", help="Output file path")
+     parser.add_argument("--size", type=int, default=10000, help="Number of records to extract")
+     args = parser.parse_args()
+
+     prepare_huggingface_dataset(args.output_file, record_count=args.size)
+     print(f"Dataset saved to {args.output_file}")
+     # Uncomment the following line if you want to upload to Argilla
+     # upload_to_argilla(args.output_file)
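
As a rough usage note (again not part of the diff): the extractor above can be run through its own __main__ block (--output_file, --size) or imported, along the lines of the sketch below. The module path assumes this file is the memos/dataset_extractor_for_florence.py entry from the Files changed list, and the optional Argilla step needs a server reachable at http://localhost:6900.

from memos.dataset_extractor_for_florence import (
    prepare_huggingface_dataset,
    upload_to_argilla,
)

# Export up to 500 (image, caption) records as JSONL, skipping "low_info" entities
# and entries whose image file no longer exists on disk.
prepare_huggingface_dataset("florence_captions.jsonl", batch_size=100, record_count=500)

# Optional review step: push the exported records to a local Argilla instance.
# upload_to_argilla("florence_captions.jsonl", batch_size=10, dataset_name="image_captioning")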