pensiev-0.25.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memos/__init__.py +6 -0
- memos/cmds/__init__.py +0 -0
- memos/cmds/library.py +1289 -0
- memos/cmds/plugin.py +96 -0
- memos/commands.py +865 -0
- memos/config.py +225 -0
- memos/crud.py +605 -0
- memos/databases/__init__.py +0 -0
- memos/databases/initializers.py +481 -0
- memos/dataset_extractor_for_florence.py +165 -0
- memos/dataset_extractor_for_internvl2.py +192 -0
- memos/default_config.yaml +88 -0
- memos/embedding.py +129 -0
- memos/frame_extractor.py +53 -0
- memos/logging_config.py +35 -0
- memos/main.py +104 -0
- memos/migrations/alembic/README +1 -0
- memos/migrations/alembic/__pycache__/env.cpython-310.pyc +0 -0
- memos/migrations/alembic/env.py +108 -0
- memos/migrations/alembic/script.py.mako +30 -0
- memos/migrations/alembic/versions/00904ac8c6fc_add_indexes_to_entitymodel.py +63 -0
- memos/migrations/alembic/versions/04acdaf75664_add_indices_to_entitytags_and_metadata.py +86 -0
- memos/migrations/alembic/versions/12504c5b1d3c_add_extra_columns_for_embedding.py +67 -0
- memos/migrations/alembic/versions/31a1ad0e10b3_add_entity_plugin_status.py +71 -0
- memos/migrations/alembic/versions/__pycache__/00904ac8c6fc_add_indexes_to_entitymodel.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/04acdaf75664_add_indices_to_entitytags_and_metadata.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/12504c5b1d3c_add_extra_columns_for_embedding.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/20f5ecab014d_add_entity_plugin_status.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/31a1ad0e10b3_add_entity_plugin_status.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/4fcb062c5128_add_extra_columns_for_embedding.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/d10c55fbb7d2_add_index_for_entity_file_type_group_.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/__pycache__/f8f158182416_add_active_app_index.cpython-310.pyc +0 -0
- memos/migrations/alembic/versions/d10c55fbb7d2_add_index_for_entity_file_type_group_.py +44 -0
- memos/migrations/alembic/versions/f8f158182416_add_active_app_index.py +75 -0
- memos/migrations/alembic.ini +116 -0
- memos/migrations.py +19 -0
- memos/models.py +199 -0
- memos/plugins/__init__.py +0 -0
- memos/plugins/ocr/__init__.py +0 -0
- memos/plugins/ocr/main.py +251 -0
- memos/plugins/ocr/models/ch_PP-OCRv4_det_infer.onnx +0 -0
- memos/plugins/ocr/models/ch_PP-OCRv4_rec_infer.onnx +0 -0
- memos/plugins/ocr/models/ch_ppocr_mobile_v2.0_cls_train.onnx +0 -0
- memos/plugins/ocr/ppocr-gpu.yaml +43 -0
- memos/plugins/ocr/ppocr.yaml +44 -0
- memos/plugins/ocr/server.py +227 -0
- memos/plugins/ocr/temp_ppocr.yaml +42 -0
- memos/plugins/vlm/__init__.py +0 -0
- memos/plugins/vlm/main.py +251 -0
- memos/prepare_dataset.py +107 -0
- memos/process_webp.py +55 -0
- memos/read_metadata.py +32 -0
- memos/record.py +358 -0
- memos/schemas.py +289 -0
- memos/search.py +1198 -0
- memos/server.py +883 -0
- memos/shotsum.py +105 -0
- memos/shotsum_with_ocr.py +145 -0
- memos/simple_tokenizer/dict/README.md +31 -0
- memos/simple_tokenizer/dict/hmm_model.utf8 +34 -0
- memos/simple_tokenizer/dict/idf.utf8 +258826 -0
- memos/simple_tokenizer/dict/jieba.dict.utf8 +348982 -0
- memos/simple_tokenizer/dict/pos_dict/char_state_tab.utf8 +6653 -0
- memos/simple_tokenizer/dict/pos_dict/prob_emit.utf8 +166 -0
- memos/simple_tokenizer/dict/pos_dict/prob_start.utf8 +259 -0
- memos/simple_tokenizer/dict/pos_dict/prob_trans.utf8 +5222 -0
- memos/simple_tokenizer/dict/stop_words.utf8 +1534 -0
- memos/simple_tokenizer/dict/user.dict.utf8 +4 -0
- memos/simple_tokenizer/linux/libsimple.so +0 -0
- memos/simple_tokenizer/macos/libsimple.dylib +0 -0
- memos/simple_tokenizer/windows/simple.dll +0 -0
- memos/static/_app/immutable/assets/0.e250c031.css +1 -0
- memos/static/_app/immutable/assets/_layout.e7937cfe.css +1 -0
- memos/static/_app/immutable/chunks/index.5c08976b.js +1 -0
- memos/static/_app/immutable/chunks/index.60ee613b.js +4 -0
- memos/static/_app/immutable/chunks/runtime.a7926cf6.js +5 -0
- memos/static/_app/immutable/chunks/scheduler.5c1cff6e.js +1 -0
- memos/static/_app/immutable/chunks/singletons.583bdf4e.js +1 -0
- memos/static/_app/immutable/entry/app.666c1643.js +1 -0
- memos/static/_app/immutable/entry/start.aed5c701.js +3 -0
- memos/static/_app/immutable/nodes/0.5862ea38.js +7 -0
- memos/static/_app/immutable/nodes/1.35378a5e.js +1 -0
- memos/static/_app/immutable/nodes/2.1ccf9ea5.js +81 -0
- memos/static/_app/version.json +1 -0
- memos/static/app.html +36 -0
- memos/static/favicon.png +0 -0
- memos/static/logos/memos_logo_1024.png +0 -0
- memos/static/logos/memos_logo_1024@2x.png +0 -0
- memos/static/logos/memos_logo_128.png +0 -0
- memos/static/logos/memos_logo_128@2x.png +0 -0
- memos/static/logos/memos_logo_16.png +0 -0
- memos/static/logos/memos_logo_16@2x.png +0 -0
- memos/static/logos/memos_logo_256.png +0 -0
- memos/static/logos/memos_logo_256@2x.png +0 -0
- memos/static/logos/memos_logo_32.png +0 -0
- memos/static/logos/memos_logo_32@2x.png +0 -0
- memos/static/logos/memos_logo_512.png +0 -0
- memos/static/logos/memos_logo_512@2x.png +0 -0
- memos/static/logos/memos_logo_64.png +0 -0
- memos/static/logos/memos_logo_64@2x.png +0 -0
- memos/test_server.py +802 -0
- memos/utils.py +49 -0
- memos_ml_backends/florence2_server.py +176 -0
- memos_ml_backends/qwen2vl_server.py +182 -0
- memos_ml_backends/schemas.py +48 -0
- pensiev-0.25.5.dist-info/LICENSE +201 -0
- pensiev-0.25.5.dist-info/METADATA +541 -0
- pensiev-0.25.5.dist-info/RECORD +111 -0
- pensiev-0.25.5.dist-info/WHEEL +5 -0
- pensiev-0.25.5.dist-info/entry_points.txt +2 -0
- pensiev-0.25.5.dist-info/top_level.txt +2 -0
memos/databases/initializers.py
@@ -0,0 +1,481 @@
"""Database initializer classes for different database backends."""

import sys
from pathlib import Path
from sqlalchemy import create_engine, event, text
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm import sessionmaker
import sqlite_vec

from ..models import RawBase, PluginModel, LibraryModel, LibraryPluginModel


def setup_database(settings, **engine_kwargs):
    """Set up and initialize the database.

    Args:
        settings: Application settings containing database configuration
        **engine_kwargs: Additional keyword arguments to pass to create_engine

    Returns:
        tuple: (engine, initializer) where engine is the SQLAlchemy engine and
            initializer is the appropriate DatabaseInitializer instance
    """
    engine, initializer = create_db_initializer(settings, **engine_kwargs)
    initializer.init_database()
    return engine, initializer


def init_database(settings):
    """Initialize the database."""
    engine, initializer = create_db_initializer(settings)
    return initializer.init_database()


def recreate_fts_and_vec_tables(settings):
    """Recreate the database-specific tables without repopulating data."""
    engine, initializer = create_db_initializer(settings)
    return initializer.recreate_index_tables()


def initialize_default_plugins(session):
    """Initialize default plugins in the database."""
    default_plugins = [
        PluginModel(
            name="builtin_vlm", description="VLM Plugin", webhook_url="/plugins/vlm"
        ),
        PluginModel(
            name="builtin_ocr", description="OCR Plugin", webhook_url="/plugins/ocr"
        ),
    ]

    for plugin in default_plugins:
        existing_plugin = session.query(PluginModel).filter_by(name=plugin.name).first()
        if not existing_plugin:
            session.add(plugin)

    session.commit()

    return default_plugins


def init_default_libraries(session, default_plugins, settings):
    """Initialize default libraries and associate them with plugins."""
    default_libraries = [
        LibraryModel(name=settings.default_library),
    ]

    for library in default_libraries:
        existing_library = (
            session.query(LibraryModel).filter_by(name=library.name).first()
        )
        if not existing_library:
            session.add(library)

    for plugin in default_plugins:
        bind_response = session.query(PluginModel).filter_by(name=plugin.name).first()
        if bind_response:
            # Check if the LibraryPluginModel already exists
            existing_library_plugin = (
                session.query(LibraryPluginModel)
                .filter_by(library_id=1, plugin_id=bind_response.id)
                .first()
            )

            if not existing_library_plugin:
                library_plugin = LibraryPluginModel(
                    library_id=1, plugin_id=bind_response.id
                )  # Assuming library_id=1 for default libraries
                session.add(library_plugin)

    session.commit()


def create_db_initializer(settings, **engine_kwargs):
    """Create a database engine and initializer based on settings.

    Args:
        settings: Application settings containing database configuration
        **engine_kwargs: Additional keyword arguments to pass to create_engine

    Returns:
        tuple: (engine, initializer) where engine is the SQLAlchemy engine and
            initializer is the appropriate DatabaseInitializer instance
    """
    default_engine_kwargs = {
        "pool_size": 10,
        "max_overflow": 20,
        "pool_timeout": 60,
        "pool_recycle": 3600,
    }

    if settings.is_sqlite:
        default_engine_kwargs["connect_args"] = {"timeout": 60}

    # Override defaults with any provided kwargs
    default_engine_kwargs.update(engine_kwargs)

    engine = create_engine(
        settings.database_url,
        **default_engine_kwargs
    )

    # Create the appropriate initializer based on database type
    if settings.is_sqlite:
        initializer = SQLiteInitializer(engine, settings)
    else:
        print("Using PostgreSQL")
        initializer = PostgreSQLInitializer(engine, settings)

    return engine, initializer


class DatabaseInitializer:
    """Base class for database initialization."""

    def __init__(self, engine, settings):
        self.engine = engine
        self.settings = settings

    def init_database(self) -> bool:
        """Initialize the database with common tables and data."""
        try:
            # Create all tables defined in SQLAlchemy models
            RawBase.metadata.create_all(self.engine)
            print(f"Database initialized successfully at {self.settings.database_url}")

            # Initialize database-specific features
            self.init_specific_features()

            # Initialize default data
            Session = sessionmaker(bind=self.engine)
            with Session() as session:
                default_plugins = initialize_default_plugins(session)
                init_default_libraries(session, default_plugins, self.settings)

            return True
        except OperationalError as e:
            print(f"Error initializing database: {e}")
            return False

    def init_extensions(self):
        """Initialize database extensions. To be implemented by subclasses."""
        pass

    def init_specific_features(self):
        """Initialize database-specific features. To be implemented by subclasses."""
        pass

    def recreate_index_tables(self) -> bool:
        """Recreate database-specific index tables. To be implemented by subclasses."""
        pass


class SQLiteInitializer(DatabaseInitializer):
    """SQLite-specific database initializer."""

    def __init__(self, engine, settings):
        super().__init__(engine, settings)
        self.init_extensions()

    def init_extensions(self):
        """Initialize SQLite-specific extensions."""
        event.listen(self.engine, "connect", self._load_sqlite_extensions)

    def _load_sqlite_extensions(self, dbapi_conn, connection_record):
        """Load SQLite extensions for full-text search and vector operations."""
        try:
            dbapi_conn.enable_load_extension(True)
        except AttributeError as e:
            print("Error: Current SQLite3 build doesn't support loading extensions.")
            print("\nRecommended solutions:")
            print("1. Install Python using Conda (recommended for both Windows and macOS):")
            print("   conda create -n yourenv python")
            print("   conda activate yourenv")
            print("\n2. Or on macOS, you can use Homebrew:")
            print("   brew install python")
            print(f"\nDetailed error: {str(e)}")
            raise

        # load simple tokenizer
        current_dir = Path(__file__).parent.parent.resolve()
        if sys.platform.startswith("linux"):
            lib_path = current_dir / "simple_tokenizer" / "linux" / "libsimple"
        elif sys.platform == "win32":
            lib_path = current_dir / "simple_tokenizer" / "windows" / "simple"
        elif sys.platform == "darwin":
            lib_path = current_dir / "simple_tokenizer" / "macos" / "libsimple"
        else:
            raise OSError(f"Unsupported operating system: {sys.platform}")

        dbapi_conn.load_extension(str(lib_path))
        dict_path = current_dir / "simple_tokenizer" / "dict"
        dbapi_conn.execute(f"SELECT jieba_dict('{dict_path}')")

        # load vector ext
        sqlite_vec.load(dbapi_conn)

        # Set WAL mode after loading extensions
        dbapi_conn.execute("PRAGMA journal_mode=WAL")

    def init_specific_features(self):
        """Initialize SQLite-specific features like FTS and vector extensions."""
        # Create FTS and Vec tables
        with self.engine.connect() as conn:
            conn.execute(
                text(
                    """
                    CREATE VIRTUAL TABLE IF NOT EXISTS entities_fts USING fts5(
                        id, filepath, tags, metadata,
                        tokenize = 'simple 0',
                        prefix = '2 3 4'
                    )
                    """
                )
            )

            conn.execute(
                text(
                    f"""
                    CREATE VIRTUAL TABLE IF NOT EXISTS entities_vec_v2 USING vec0(
                        embedding float[{self.settings.embedding.num_dim}] distance_metric=cosine,
                        file_type_group text,
                        created_at_timestamp integer,
                        file_created_at_timestamp integer,
                        file_created_at_date text partition key,
                        app_name text,
                        library_id integer
                    )
                    """
                )
            )

    def recreate_index_tables(self) -> bool:
        """Recreate SQLite-specific index tables (FTS and vector tables)."""
        Session = sessionmaker(bind=self.engine)

        with Session() as session:
            try:
                # Drop existing tables
                session.execute(text("DROP TABLE IF EXISTS entities_fts"))
                session.execute(text("DROP TABLE IF EXISTS entities_vec_v2"))

                # Recreate entities_fts table
                session.execute(
                    text(
                        """
                        CREATE VIRTUAL TABLE entities_fts USING fts5(
                            id, filepath, tags, metadata,
                            tokenize = 'simple 0',
                            prefix = '2 3 4'
                        )
                        """
                    )
                )

                # Recreate entities_vec_v2 table
                session.execute(
                    text(
                        f"""
                        CREATE VIRTUAL TABLE entities_vec_v2 USING vec0(
                            embedding float[{self.settings.embedding.num_dim}] distance_metric=cosine,
                            file_type_group text,
                            created_at_timestamp integer,
                            file_created_at_timestamp integer,
                            file_created_at_date text partition key,
                            app_name text,
                            library_id integer
                        )
                        """
                    )
                )

                session.commit()
                print("Successfully recreated entities_fts and entities_vec_v2 tables.")
                return True
            except Exception as e:
                session.rollback()
                print(f"Error recreating tables: {e}")
                return False


class PostgreSQLInitializer(DatabaseInitializer):
    """PostgreSQL-specific database initializer."""

    def __init__(self, engine, settings):
        super().__init__(engine, settings)
        self.init_extensions()

    def init_extensions(self):
        """Initialize PostgreSQL-specific extensions."""
        with self.engine.connect() as conn:
            # Create extensions in a separate transaction
            conn.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
            conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
            conn.commit()

    def init_specific_features(self):
        """Initialize PostgreSQL-specific features."""
        with self.engine.connect() as conn:
            # Create the tsvector column and index for full-text search
            conn.execute(
                text(
                    f"""
                    -- Create a table to store the full-text search data
                    CREATE TABLE IF NOT EXISTS entities_fts (
                        id INTEGER PRIMARY KEY,
                        filepath TEXT,
                        tags TEXT,
                        metadata TEXT,
                        search_vector tsvector GENERATED ALWAYS AS (
                            setweight(to_tsvector('simple', coalesce(filepath, '')), 'A') ||
                            setweight(to_tsvector('simple', coalesce(tags, '')), 'B') ||
                            setweight(to_tsvector('simple', coalesce(metadata, '')), 'C')
                        ) STORED,
                        -- Add raw text columns for prefix/substring search
                        search_text TEXT GENERATED ALWAYS AS (
                            coalesce(filepath, '') || ' ' ||
                            coalesce(tags, '') || ' ' ||
                            coalesce(metadata, '')
                        ) STORED
                    );

                    -- Create a GIN index for fast full-text search
                    CREATE INDEX IF NOT EXISTS idx_entities_fts_search_vector
                    ON entities_fts USING gin(search_vector);

                    -- Create trigram index for fuzzy matching on filepath and search_text
                    CREATE INDEX IF NOT EXISTS idx_entities_fts_filepath_trgm
                    ON entities_fts USING gin(filepath gin_trgm_ops);
                    CREATE INDEX IF NOT EXISTS idx_entities_fts_search_text_trgm
                    ON entities_fts USING gin(search_text gin_trgm_ops);
                    """
                )
            )
            conn.commit()

            # Create vector table and indexes in a separate transaction
            conn.execute(
                text(
                    f"""
                    -- Create vector search table
                    CREATE TABLE IF NOT EXISTS entities_vec_v2 (
                        rowid INTEGER PRIMARY KEY,
                        embedding vector({self.settings.embedding.num_dim}),
                        file_type_group TEXT,
                        created_at_timestamp INTEGER,
                        file_created_at_timestamp INTEGER,
                        file_created_at_date TEXT,
                        app_name TEXT,
                        library_id INTEGER
                    );

                    -- Create index for vector similarity search using HNSW
                    CREATE INDEX IF NOT EXISTS idx_entities_vec_v2_embedding
                    ON entities_vec_v2 USING hnsw (embedding vector_cosine_ops)
                    WITH (m = 16, ef_construction = 64);

                    -- Create indexes for filtering
                    CREATE INDEX IF NOT EXISTS idx_entities_vec_v2_file_type_group
                    ON entities_vec_v2(file_type_group);
                    CREATE INDEX IF NOT EXISTS idx_entities_vec_v2_file_created_at_date
                    ON entities_vec_v2(file_created_at_date);
                    CREATE INDEX IF NOT EXISTS idx_entities_vec_v2_app_name
                    ON entities_vec_v2(app_name);
                    CREATE INDEX IF NOT EXISTS idx_entities_vec_v2_library_id
                    ON entities_vec_v2(library_id);
                    """
                )
            )
            conn.commit()

    def recreate_index_tables(self) -> bool:
        """Recreate PostgreSQL-specific index tables."""
        Session = sessionmaker(bind=self.engine)

        with Session() as session:
            try:
                # Drop existing tables
                session.execute(text("DROP TABLE IF EXISTS entities_fts CASCADE"))
                session.execute(text("DROP TABLE IF EXISTS entities_vec_v2 CASCADE"))
                session.commit()

                # Ensure extensions are created
                session.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm"))
                session.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
                session.commit()

                # Recreate entities_fts table with tsvector support
                session.execute(
                    text(
                        f"""
                        CREATE TABLE entities_fts (
                            id INTEGER PRIMARY KEY,
                            filepath TEXT,
                            tags TEXT,
                            metadata TEXT,
                            search_vector tsvector GENERATED ALWAYS AS (
                                setweight(to_tsvector('simple', coalesce(filepath, '')), 'A') ||
                                setweight(to_tsvector('simple', coalesce(tags, '')), 'B') ||
                                setweight(to_tsvector('simple', coalesce(metadata, '')), 'C')
                            ) STORED,
                            -- Add raw text columns for prefix/substring search
                            search_text TEXT GENERATED ALWAYS AS (
                                coalesce(filepath, '') || ' ' ||
                                coalesce(tags, '') || ' ' ||
                                coalesce(metadata, '')
                            ) STORED
                        );

                        -- Create a GIN index for fast full-text search
                        CREATE INDEX idx_entities_fts_search_vector
                        ON entities_fts USING gin(search_vector);

                        -- Create trigram index for fuzzy matching on filepath and search_text
                        CREATE INDEX idx_entities_fts_filepath_trgm
                        ON entities_fts USING gin(filepath gin_trgm_ops);
                        CREATE INDEX idx_entities_fts_search_text_trgm
                        ON entities_fts USING gin(search_text gin_trgm_ops);
                        """
                    )
                )
                session.commit()

                # Create vector table and indexes in a separate transaction
                session.execute(
                    text(
                        f"""
                        -- Create vector search table
                        CREATE TABLE entities_vec_v2 (
                            rowid INTEGER PRIMARY KEY,
                            embedding vector({self.settings.embedding.num_dim}),
                            file_type_group TEXT,
                            created_at_timestamp INTEGER,
                            file_created_at_timestamp INTEGER,
                            file_created_at_date TEXT,
                            app_name TEXT,
                            library_id INTEGER
                        );

                        -- Create index for vector similarity search using HNSW
                        CREATE INDEX idx_entities_vec_v2_embedding
                        ON entities_vec_v2 USING hnsw (embedding vector_cosine_ops)
                        WITH (m = 16, ef_construction = 64);

                        -- Create indexes for filtering
                        CREATE INDEX idx_entities_vec_v2_file_type_group
                        ON entities_vec_v2(file_type_group);
                        CREATE INDEX idx_entities_vec_v2_file_created_at_date
                        ON entities_vec_v2(file_created_at_date);
                        CREATE INDEX idx_entities_vec_v2_app_name
                        ON entities_vec_v2(app_name);
                        CREATE INDEX idx_entities_vec_v2_library_id
                        ON entities_vec_v2(library_id);
                        """
                    )
                )
                session.commit()

                print("Successfully recreated entities_fts and entities_vec_v2 tables.")
                return True
            except Exception as e:
                session.rollback()
                print(f"Error recreating tables: {e}")
                return False
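
For orientation, a minimal usage sketch of the initializer API defined above. The `settings` object here is a stand-in (in the real package it comes from the memos configuration); it only assumes the attributes this module actually reads, and the concrete values (library name, embedding dimension) are illustrative assumptions, not package defaults.

```python
from types import SimpleNamespace

from memos.databases.initializers import setup_database, recreate_fts_and_vec_tables

# Stand-in settings object; the values below are illustrative assumptions.
settings = SimpleNamespace(
    database_url="sqlite:///memos.db",
    is_sqlite=True,
    default_library="screenshots",           # assumed library name
    embedding=SimpleNamespace(num_dim=768),  # assumed embedding dimension
)

# Creates the ORM tables, the FTS/vector index tables, and the default
# plugins/libraries, returning the engine and the chosen initializer.
engine, initializer = setup_database(settings)

# Later, the index tables can be rebuilt without repopulating entity data.
recreate_fts_and_vec_tables(settings)
```
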
memos/dataset_extractor_for_florence.py
@@ -0,0 +1,165 @@
import json
import argparse
from sqlalchemy.orm import sessionmaker
from memos.models import EntityModel, EntityMetadataModel
from memos.config import get_database_path
from sqlalchemy import create_engine
from tqdm import tqdm
from pathlib import Path
import argilla as rg
from PIL import Image
import io


def prepare_huggingface_dataset(output_file, batch_size=100, record_count=10000):
    """Prepare a Hugging Face dataset and save it as JSONL."""
    db_path = get_database_path()
    engine = create_engine(f"sqlite:///{db_path}")
    Session = sessionmaker(bind=engine)

    with Session() as session, open(output_file, "w", encoding="utf-8") as f:
        query = session.query(EntityModel)
        total = query.count()

        progress_bar = tqdm(
            total=min(total, record_count), desc="Processing entities", unit="entity"
        )
        inserted_records = 0

        for offset in range(0, total, batch_size):
            batch = query.limit(batch_size).offset(offset).all()

            for entity in batch:
                # Skip entities with "low_info" tag
                if any(tag.name == "low_info" for tag in entity.tags):
                    progress_bar.update(1)
                    continue

                metadata = {entry.key: entry.value for entry in entity.metadata_entries}

                answer = metadata.get("internvl-72b-result") or metadata.get(
                    "internvl_result"
                )
                if not answer or not Path(entity.filepath).exists():
                    progress_bar.update(1)
                    continue

                record = {
                    "id": entity.id,
                    "image": entity.filepath,
                    "question": "<MORE_DETAILED_CAPTION>",
                    "answer": answer,
                }
                json.dump(record, f, ensure_ascii=False)
                f.write("\n")
                progress_bar.update(1)
                inserted_records += 1

                if inserted_records >= record_count:
                    break
            if inserted_records >= record_count:
                break

        progress_bar.close()

    print(f"Dataset saved to {output_file}")


def init_argilla_dataset(client, dataset_name="image_captioning"):
    workspace_name = "argilla"

    workspace = client.workspaces(workspace_name)

    if workspace is None:
        workspace = rg.Workspace(name=workspace_name, client=client)
        workspace.create()
        print(f"Workspace created: {workspace_name}")

    dataset = client.datasets(name=dataset_name)

    if dataset is not None:
        return dataset

    settings = rg.Settings(
        fields=[
            rg.ImageField(name="image"),
            rg.TextField(name="filepath")
        ],
        questions=[
            rg.TextQuestion(
                name="text",
                title="Description of the image",
                required=True,
                use_markdown=True,
            )
        ],
    )

    dataset = rg.Dataset(
        name=dataset_name, workspace=workspace_name, settings=settings, client=client
    )

    dataset.create()
    print(f"Dataset created: {dataset_name}")

    return dataset


def upload_to_argilla(input_file, batch_size=10, dataset_name="image_captioning"):
    """Upload a JSONL dataset to Argilla."""

    client = rg.Argilla(api_url="http://localhost:6900", api_key="argilla.apikey")

    dataset = init_argilla_dataset(client, dataset_name)

    records = []
    total_records = sum(1 for _ in open(input_file, "r"))

    with open(input_file, "r", encoding="utf-8") as f:
        progress_bar = tqdm(
            total=total_records, desc="Uploading to Argilla", unit="record"
        )

        for line in f:
            record_data = json.loads(line)
            image = Image.open(record_data["image"]).convert("RGB")
            image.thumbnail((1280, 1280))

            rg_record = rg.Record(
                id=str(record_data["id"]),
                fields={
                    "image": image,
                    "filepath": record_data["image"],
                },
                suggestions=[
                    rg.Suggestion(
                        "text", record_data["answer"], score=1.0, agent="internvl2"
                    )
                ],
            )
            records.append(rg_record)

            if len(records) >= batch_size:
                dataset.records.log(records)
                progress_bar.update(batch_size)
                records = []

        if records:
            dataset.records.log(records)
            progress_bar.update(len(records))

        progress_bar.close()

    print(f"Dataset uploaded to Argilla: {dataset_name}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare and upload dataset")
    parser.add_argument("--output_file", default="dataset.jsonl", help="Output file path")
    parser.add_argument("--size", type=int, default=10000, help="Number of records to extract")
    args = parser.parse_args()

    prepare_huggingface_dataset(args.output_file, record_count=args.size)
    print(f"Dataset saved to {args.output_file}")
    # Uncomment the following line if you want to upload to Argilla
    # upload_to_argilla(args.output_file)
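
The extractor above has an argparse entry point, so it can be run as a script or imported; a hedged sketch of both uses follows. The module path is inferred from the file list (the +165 line count matches dataset_extractor_for_florence.py), and it assumes a populated memos database plus, for the upload step, a local Argilla server at the hard-coded http://localhost:6900.

```python
# Script use (writes JSONL with {"id", "image", "question", "answer"} records):
#   python -m memos.dataset_extractor_for_florence --output_file dataset.jsonl --size 1000

# Library use:
from memos.dataset_extractor_for_florence import (
    prepare_huggingface_dataset,
    upload_to_argilla,
)

# Export up to 1000 captioned screenshots from the local memos database.
prepare_huggingface_dataset("dataset.jsonl", record_count=1000)

# Optionally push the records to Argilla for human review
# (uses the default API key and local URL hard-coded in the script).
upload_to_argilla("dataset.jsonl")
```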