remdb 0.3.242__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +129 -0
- rem/agentic/README.md +760 -0
- rem/agentic/__init__.py +54 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +38 -0
- rem/agentic/agents/agent_manager.py +311 -0
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +425 -0
- rem/agentic/context_builder.py +360 -0
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +273 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +240 -0
- rem/agentic/providers/phoenix.py +926 -0
- rem/agentic/providers/pydantic_ai.py +854 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +737 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +242 -0
- rem/api/README.md +657 -0
- rem/api/deps.py +253 -0
- rem/api/main.py +460 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +820 -0
- rem/api/mcp_router/server.py +243 -0
- rem/api/mcp_router/tools.py +1605 -0
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +520 -0
- rem/api/routers/auth.py +898 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/child_streaming.py +394 -0
- rem/api/routers/chat/completions.py +702 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +202 -0
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +546 -0
- rem/api/routers/chat/streaming.py +950 -0
- rem/api/routers/chat/streaming_utils.py +327 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +87 -0
- rem/api/routers/feedback.py +276 -0
- rem/api/routers/messages.py +620 -0
- rem/api/routers/models.py +86 -0
- rem/api/routers/query.py +362 -0
- rem/api/routers/shared_sessions.py +422 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +36 -0
- rem/auth/jwt.py +367 -0
- rem/auth/middleware.py +318 -0
- rem/auth/providers/__init__.py +16 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/email.py +215 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +517 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +299 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +549 -0
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +495 -0
- rem/cli/commands/db.py +828 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1698 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +388 -0
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +230 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/commands/session.py +453 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +123 -0
- rem/config.py +244 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +70 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +672 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +246 -0
- rem/models/entities/__init__.py +68 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +64 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +181 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/session.py +84 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/subscriber.py +175 -0
- rem/models/entities/user.py +93 -0
- rem/py.typed +0 -0
- rem/registry.py +373 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/agent-builder.yaml +235 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +132 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +18 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +760 -0
- rem/services/content/service.py +762 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +322 -0
- rem/services/dreaming/moment_service.py +251 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/email/__init__.py +10 -0
- rem/services/email/service.py +522 -0
- rem/services/email/templates.py +360 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +127 -0
- rem/services/embeddings/worker.py +435 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +960 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +757 -0
- rem/services/postgres/__init__.py +49 -0
- rem/services/postgres/diff_service.py +599 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
- rem/services/postgres/register_type.py +353 -0
- rem/services/postgres/repository.py +481 -0
- rem/services/postgres/schema_generator.py +661 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +355 -0
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +318 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +180 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +608 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +13 -0
- rem/services/session/compression.py +488 -0
- rem/services/session/pydantic_messages.py +310 -0
- rem/services/session/reload.py +85 -0
- rem/services/user_service.py +130 -0
- rem/settings.py +1877 -0
- rem/sql/background_indexes.sql +52 -0
- rem/sql/migrations/001_install.sql +983 -0
- rem/sql/migrations/002_install_models.sql +3157 -0
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +282 -0
- rem/sql/migrations/005_schema_update.sql +145 -0
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +628 -0
- rem/utils/__init__.py +61 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +436 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/files.py +323 -0
- rem/utils/markdown.py +16 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +492 -0
- rem/utils/schema_loader.py +649 -0
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +350 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +325 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +7 -0
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- rem/workers/unlogged_maintainer.py +463 -0
- remdb-0.3.242.dist-info/METADATA +1632 -0
- remdb-0.3.242.dist-info/RECORD +235 -0
- remdb-0.3.242.dist-info/WHEEL +4 -0
- remdb-0.3.242.dist-info/entry_points.txt +2 -0
rem/utils/files.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File utilities for consistent file handling throughout REM.
|
|
3
|
+
|
|
4
|
+
Provides context managers and helpers for temporary file operations,
|
|
5
|
+
ensuring proper cleanup and consistent patterns.
|
|
6
|
+
|
|
7
|
+
Also provides DataFrame I/O utilities using Polars with automatic
|
|
8
|
+
format detection based on file extension.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import tempfile
|
|
12
|
+
from contextlib import contextmanager
|
|
13
|
+
from io import BytesIO
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Generator, Optional, Union
|
|
16
|
+
|
|
17
|
+
import polars as pl
|
|
18
|
+
from loguru import logger
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@contextmanager
def temp_file_from_bytes(
    content: bytes,
    suffix: str = "",
    prefix: str = "rem_",
    dir: Optional[str] = None,
) -> Generator[Path, None, None]:
    """
    Create a temporary file from bytes, yield path, cleanup automatically.

    The file is written and closed before its path is yielded, and it is
    removed when the ``with`` block exits, whether normally or through an
    exception. A failure while writing the content also removes the file.

    Args:
        content: Bytes to write to the temporary file
        suffix: File extension (e.g., ".pdf", ".wav")
        prefix: Prefix for the temp file name
        dir: Directory for temp file (uses system temp if None)

    Yields:
        Path to the temporary file

    Example:
        >>> with temp_file_from_bytes(pdf_bytes, suffix=".pdf") as tmp_path:
        ...     result = process_pdf(tmp_path)
        # File is automatically cleaned up after the block

    Note:
        The file is created with delete=False so we control cleanup.
        This allows the file to be read by external processes.
    """
    tmp_path: Optional[Path] = None
    try:
        with tempfile.NamedTemporaryFile(
            suffix=suffix,
            prefix=prefix,
            dir=dir,
            delete=False,
        ) as tmp:
            # Record the path BEFORE writing: if the write raises, the
            # finally clause must still know which file to remove.
            tmp_path = Path(tmp.name)
            tmp.write(content)

        # The handle is closed here, so the yielded path can be reopened.
        yield tmp_path

    finally:
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except Exception as e:
                # Best-effort cleanup: never mask the caller's own exception.
                logger.warning(f"Failed to cleanup temp file {tmp_path}: {e}")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@contextmanager
def temp_file_empty(
    suffix: str = "",
    prefix: str = "rem_",
    dir: Optional[str] = None,
) -> Generator[Path, None, None]:
    """
    Yield the path of a freshly created, zero-byte temporary file.

    Intended for cases where the caller (or an external process) fills the
    file in afterwards. The file handle is closed before the path is
    yielded, and the file is removed when the ``with`` block exits.

    Args:
        suffix: File extension
        prefix: Prefix for the temp file name
        dir: Directory for temp file

    Yields:
        Path to the empty temporary file
    """
    scratch: Optional[Path] = None
    try:
        handle = tempfile.NamedTemporaryFile(
            suffix=suffix, prefix=prefix, dir=dir, delete=False
        )
        # delete=False: closing the handle keeps the file on disk.
        with handle:
            scratch = Path(handle.name)

        yield scratch
    finally:
        if scratch is not None:
            try:
                scratch.unlink(missing_ok=True)
            except Exception as exc:
                logger.warning(f"Failed to cleanup temp file {scratch}: {exc}")
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@contextmanager
def temp_directory(
    prefix: str = "rem_",
    dir: Optional[str] = None,
) -> Generator[Path, None, None]:
    """
    Yield a new temporary directory and remove it, with contents, on exit.

    Args:
        prefix: Prefix for the temp directory name
        dir: Parent directory for temp directory

    Yields:
        Path to the temporary directory
    """
    from shutil import rmtree

    workdir = Path(tempfile.mkdtemp(prefix=prefix, dir=dir))
    try:
        yield workdir
    finally:
        try:
            # ignore_errors=True makes removal best-effort.
            rmtree(workdir, ignore_errors=True)
        except Exception as exc:
            logger.warning(f"Failed to cleanup temp directory {workdir}: {exc}")
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def ensure_parent_exists(path: Path) -> Path:
    """
    Create the directory that will contain ``path`` if it is missing.

    Intermediate directories are created as needed; an already existing
    parent is not an error. The file at ``path`` itself is not created.

    Args:
        path: File path whose parent should exist

    Returns:
        The original path (for chaining)
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    return path
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def safe_delete(path: Path) -> bool:
    """
    Remove a file without raising, reporting whether it succeeded.

    A path that does not exist counts as success. Any exception raised by
    the removal is logged as a warning and reported as failure.

    Args:
        path: Path to delete

    Returns:
        True if deleted or didn't exist, False on error
    """
    try:
        path.unlink(missing_ok=True)
    except Exception as exc:
        logger.warning(f"Failed to delete {path}: {exc}")
        return False
    else:
        return True
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# Extension to Polars reader mapping.
# Each value is called as reader(source, **kwargs).
_EXTENSION_READERS = {
    ".csv": pl.read_csv,
    # Tab is only the DEFAULT separator: merging it underneath the caller's
    # kwargs lets an explicit separator=... override it instead of raising
    # "got multiple values for keyword argument 'separator'".
    ".tsv": lambda p, **kw: pl.read_csv(p, **{"separator": "\t", **kw}),
    ".parquet": pl.read_parquet,
    ".pq": pl.read_parquet,
    ".json": pl.read_json,
    ".jsonl": pl.read_ndjson,
    ".ndjson": pl.read_ndjson,
    ".avro": pl.read_avro,
    ".xlsx": pl.read_excel,
    ".xls": pl.read_excel,
    ".ods": pl.read_ods,
    ".ipc": pl.read_ipc,
    ".arrow": pl.read_ipc,
    ".feather": pl.read_ipc,
}
|
|
192
|
+
|
|
193
|
+
# Extension to Polars writer mapping
|
|
194
|
+
_EXTENSION_WRITERS = {
|
|
195
|
+
".csv": "write_csv",
|
|
196
|
+
".tsv": "write_csv", # with separator="\t"
|
|
197
|
+
".parquet": "write_parquet",
|
|
198
|
+
".pq": "write_parquet",
|
|
199
|
+
".json": "write_json",
|
|
200
|
+
".jsonl": "write_ndjson",
|
|
201
|
+
".ndjson": "write_ndjson",
|
|
202
|
+
".avro": "write_avro",
|
|
203
|
+
".xlsx": "write_excel",
|
|
204
|
+
".ipc": "write_ipc",
|
|
205
|
+
".arrow": "write_ipc",
|
|
206
|
+
".feather": "write_ipc",
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def read_dataframe(
    source: Union[str, Path, bytes],
    filename: Optional[str] = None,
    **kwargs,
) -> pl.DataFrame:
    """
    Load a Polars DataFrame, choosing the reader from the file extension.

    The extension is taken from ``source`` when it is a path, or from
    ``filename`` when ``source`` is raw bytes, and is matched
    case-insensitively against the registered readers:
    - CSV (.csv), TSV (.tsv)
    - Parquet (.parquet, .pq)
    - JSON (.json), JSONL/NDJSON (.jsonl, .ndjson)
    - Avro (.avro)
    - Excel (.xlsx, .xls)
    - OpenDocument (.ods)
    - Arrow IPC (.ipc, .arrow, .feather)

    Args:
        source: File path (str/Path) or bytes content
        filename: Required when source is bytes, to determine format
        **kwargs: Additional arguments passed to the Polars reader

    Returns:
        Polars DataFrame

    Raises:
        ValueError: If format cannot be determined or is unsupported

    Examples:
        >>> df = read_dataframe("data.csv")
        >>> df = read_dataframe("data.parquet")
        >>> df = read_dataframe(csv_bytes, filename="data.csv")
    """
    if isinstance(source, bytes):
        # Raw bytes carry no name, so the format must come from `filename`.
        if not filename:
            raise ValueError("filename is required when source is bytes")
        ext = Path(filename).suffix.lower()
        payload = BytesIO(source)
    else:
        payload = Path(source)
        ext = payload.suffix.lower()

    load = _EXTENSION_READERS.get(ext)
    if load is None:
        known = ", ".join(sorted(_EXTENSION_READERS))
        raise ValueError(f"Unsupported file format: {ext}. Supported formats: {known}")

    try:
        frame = load(payload, **kwargs)
    except Exception as exc:
        # Log for context, then let the reader's own exception propagate.
        logger.error(f"Failed to read DataFrame from {ext} format: {exc}")
        raise
    return frame
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def write_dataframe(
    df: pl.DataFrame,
    dest: Union[str, Path],
    **kwargs,
) -> None:
    """
    Save a Polars DataFrame, choosing the writer from the file extension.

    The destination's parent directory is created when missing. The
    extension is matched case-insensitively against the registered writers:
    - CSV (.csv), TSV (.tsv)
    - Parquet (.parquet, .pq)
    - JSON (.json), JSONL/NDJSON (.jsonl, .ndjson)
    - Avro (.avro)
    - Excel (.xlsx)
    - Arrow IPC (.ipc, .arrow, .feather)

    Args:
        df: Polars DataFrame to write
        dest: Destination file path
        **kwargs: Additional arguments passed to the Polars writer

    Raises:
        ValueError: If format cannot be determined or is unsupported

    Examples:
        >>> write_dataframe(df, "output.csv")
        >>> write_dataframe(df, "output.parquet")
        >>> write_dataframe(df, "output.jsonl")
    """
    target = Path(dest)
    ext = target.suffix.lower()

    method_name = _EXTENSION_WRITERS.get(ext)
    if method_name is None:
        known = ", ".join(sorted(_EXTENSION_WRITERS))
        raise ValueError(
            f"Unsupported file format for writing: {ext}. Supported formats: {known}"
        )

    ensure_parent_exists(target)

    # TSV shares the CSV writer; tab is a default the caller may override.
    if ext == ".tsv":
        kwargs.setdefault("separator", "\t")

    try:
        getattr(df, method_name)(target, **kwargs)
    except Exception as exc:
        # Log for context, then let the writer's own exception propagate.
        logger.error(f"Failed to write DataFrame to {ext} format: {exc}")
        raise
|
rem/utils/markdown.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Markdown conversion utilities for document processing."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def to_markdown(content: str, filename: str) -> str:
    """
    Prefix extracted text with a level-1 markdown heading.

    The result is "# <filename>", one blank line, then the content
    unchanged.

    Args:
        content: Extracted text content
        filename: Source filename

    Returns:
        Structured markdown string with header
    """
    heading = f"# {filename}\n"
    return heading + "\n" + content
|
rem/utils/mime_types.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Centralized MIME type mappings for file format detection.
|
|
3
|
+
|
|
4
|
+
Provides bidirectional mappings between file extensions and MIME types.
|
|
5
|
+
Use these constants throughout the codebase instead of inline dictionaries.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Forward lookup table: lower-case file extension (leading dot included)
# -> MIME type string.
EXTENSION_TO_MIME: dict[str, str] = {
    # --- raster and vector images ---
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".webp": "image/webp",
    ".bmp": "image/bmp",
    ".tiff": "image/tiff",
    ".svg": "image/svg+xml",
    # --- office and portable documents ---
    ".pdf": "application/pdf",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".doc": "application/msword",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".ppt": "application/vnd.ms-powerpoint",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".xls": "application/vnd.ms-excel",
    # --- audio ---
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".m4a": "audio/x-m4a",
    ".flac": "audio/flac",
    ".ogg": "audio/ogg",
    ".aac": "audio/aac",
    # --- video ---
    ".mp4": "video/mp4",
    ".webm": "video/webm",
    ".avi": "video/x-msvideo",
    ".mov": "video/quicktime",
    # --- plain text, markup, data and source code ---
    ".txt": "text/plain",
    ".md": "text/markdown",
    ".markdown": "text/markdown",
    ".json": "application/json",
    ".yaml": "application/x-yaml",
    ".yml": "application/x-yaml",
    ".xml": "application/xml",
    ".html": "text/html",
    ".css": "text/css",
    ".js": "application/javascript",
    ".py": "text/x-python",
    ".ts": "application/typescript",
    ".csv": "text/csv",
}
|
|
54
|
+
|
|
55
|
+
# Reverse lookup table: MIME type string -> one canonical file extension.
# Where several extensions share a MIME type a single one is chosen here
# (e.g. ".jpg" for image/jpeg, ".md" for text/markdown, ".yaml" for
# application/x-yaml). "audio/mp4" is accepted as an extra alias for ".m4a".
MIME_TO_EXTENSION: dict[str, str] = {
    # --- raster and vector images ---
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "image/gif": ".gif",
    "image/webp": ".webp",
    "image/bmp": ".bmp",
    "image/tiff": ".tiff",
    "image/svg+xml": ".svg",
    # --- office and portable documents ---
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/msword": ".doc",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
    "application/vnd.ms-powerpoint": ".ppt",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/vnd.ms-excel": ".xls",
    # --- audio ---
    "audio/wav": ".wav",
    "audio/mpeg": ".mp3",
    "audio/x-m4a": ".m4a",
    "audio/mp4": ".m4a",
    "audio/flac": ".flac",
    "audio/ogg": ".ogg",
    "audio/aac": ".aac",
    # --- video ---
    "video/mp4": ".mp4",
    "video/webm": ".webm",
    "video/x-msvideo": ".avi",
    "video/quicktime": ".mov",
    # --- plain text, markup, data and source code ---
    "text/plain": ".txt",
    "text/markdown": ".md",
    "application/json": ".json",
    "application/x-yaml": ".yaml",
    "application/xml": ".xml",
    "text/html": ".html",
    "text/css": ".css",
    "application/javascript": ".js",
    "text/x-python": ".py",
    "application/typescript": ".ts",
    "text/csv": ".csv",
}
|
|
99
|
+
|
|
100
|
+
# Extension sets per media category (lower-case, leading dot included),
# for quick membership checks.
IMAGE_EXTENSIONS = {
    ".png", ".jpg", ".jpeg", ".gif",
    ".webp", ".bmp", ".tiff", ".svg",
}
DOCUMENT_EXTENSIONS = {
    ".pdf",
    ".docx", ".doc",
    ".pptx", ".ppt",
    ".xlsx", ".xls",
}
AUDIO_EXTENSIONS = {
    ".wav", ".mp3", ".m4a",
    ".flac", ".ogg", ".aac",
}
VIDEO_EXTENSIONS = {
    ".mp4", ".webm", ".avi", ".mov",
}
TEXT_EXTENSIONS = {
    ".txt", ".md", ".markdown",
    ".json", ".yaml", ".yml", ".xml",
    ".html", ".css", ".js",
    ".py", ".ts", ".csv",
}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def get_extension(mime_type: str, default: str = ".bin") -> str:
    """
    Get file extension for a MIME type.

    The lookup ignores letter case, surrounding whitespace and any
    parameters after a semicolon, so a full Content-Type value such as
    "Text/HTML; charset=utf-8" resolves the same way as "text/html".

    Args:
        mime_type: MIME type string (e.g., "image/png")
        default: Default extension if MIME type not found

    Returns:
        File extension with leading dot (e.g., ".png")
    """
    # Keep only the "type/subtype" part; the table keys are lower-case.
    essence = mime_type.split(";", 1)[0].strip().lower()
    return MIME_TO_EXTENSION.get(essence, default)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def get_mime_type(extension: str, default: str = "application/octet-stream") -> str:
    """
    Look up the MIME type registered for a file extension.

    The extension is lower-cased and given a leading dot if it lacks one
    before the lookup.

    Args:
        extension: File extension with or without leading dot
        default: Default MIME type if extension not found

    Returns:
        MIME type string (e.g., "image/png")
    """
    key = extension.lower()
    if not key.startswith("."):
        key = "." + key
    return EXTENSION_TO_MIME.get(key, default)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def is_image(extension_or_mime: str) -> bool:
    """
    Check if extension or MIME type represents an image.

    Args:
        extension_or_mime: File extension with a leading dot (e.g., ".png")
            or a MIME type string (e.g., "image/png"). Both forms are
            matched case-insensitively.

    Returns:
        True for a known image extension or any "image/..." MIME type
    """
    value = extension_or_mime.lower()
    if value.startswith("."):
        return value in IMAGE_EXTENSIONS
    # MIME type names are case-insensitive, so test the lower-cased form.
    return value.startswith("image/")
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def is_audio(extension_or_mime: str) -> bool:
    """
    Check if extension or MIME type represents audio.

    Args:
        extension_or_mime: File extension with a leading dot (e.g., ".mp3")
            or a MIME type string (e.g., "audio/mpeg"). Both forms are
            matched case-insensitively.

    Returns:
        True for a known audio extension or any "audio/..." MIME type
    """
    value = extension_or_mime.lower()
    if value.startswith("."):
        return value in AUDIO_EXTENSIONS
    # MIME type names are case-insensitive, so test the lower-cased form.
    return value.startswith("audio/")
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def is_document(extension_or_mime: str) -> bool:
    """
    Check if extension or MIME type represents a document.

    Args:
        extension_or_mime: File extension with a leading dot (e.g., ".pdf")
            or a MIME type string (e.g., "application/pdf"). Both forms are
            matched case-insensitively.

    Returns:
        True for a known document extension, for the PDF and legacy
        Microsoft Office MIME types, or for any MIME type containing
        "officedocument" (the OOXML family)
    """
    value = extension_or_mime.lower()
    if value.startswith("."):
        return value in DOCUMENT_EXTENSIONS
    # Legacy Office types lack the "officedocument" marker, so they are
    # listed explicitly; this keeps .doc/.xls/.ppt consistent with the
    # extension branch, which treats all three as documents.
    doc_mimes = {
        "application/pdf",
        "application/msword",
        "application/vnd.ms-excel",
        "application/vnd.ms-powerpoint",
    }
    return value in doc_mimes or "officedocument" in value
|