remdb 0.3.242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (235) hide show
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
rem/utils/files.py ADDED
@@ -0,0 +1,323 @@
1
+ """
2
+ File utilities for consistent file handling throughout REM.
3
+
4
+ Provides context managers and helpers for temporary file operations,
5
+ ensuring proper cleanup and consistent patterns.
6
+
7
+ Also provides DataFrame I/O utilities using Polars with automatic
8
+ format detection based on file extension.
9
+ """
10
+
11
+ import tempfile
12
+ from contextlib import contextmanager
13
+ from io import BytesIO
14
+ from pathlib import Path
15
+ from typing import Generator, Optional, Union
16
+
17
+ import polars as pl
18
+ from loguru import logger
19
+
20
+
21
@contextmanager
def temp_file_from_bytes(
    content: bytes,
    suffix: str = "",
    prefix: str = "rem_",
    dir: Optional[str] = None,
) -> Generator[Path, None, None]:
    """
    Create a temporary file from bytes, yield path, cleanup automatically.

    The file is removed when the block exits, whether it exits normally or
    through an exception. It is also removed when writing ``content`` fails
    before the path is ever yielded.

    Args:
        content: Bytes to write to the temporary file
        suffix: File extension (e.g., ".pdf", ".wav")
        prefix: Prefix for the temp file name
        dir: Directory for temp file (uses system temp if None)

    Yields:
        Path to the temporary file (already closed, content flushed)

    Example:
        >>> with temp_file_from_bytes(pdf_bytes, suffix=".pdf") as tmp_path:
        ...     result = process_pdf(tmp_path)
        # File is automatically cleaned up after the block

    Note:
        The file is created with delete=False so we control cleanup.
        The handle is closed before the path is yielded, so the file can
        be reopened by name while the block runs.
    """
    tmp_path: Optional[Path] = None
    try:
        with tempfile.NamedTemporaryFile(
            suffix=suffix,
            prefix=prefix,
            dir=dir,
            delete=False,
        ) as tmp:
            # Record the path BEFORE writing. With delete=False nothing else
            # removes the file, so if write() raised while tmp_path was still
            # None the finally block below would skip cleanup and leak it.
            tmp_path = Path(tmp.name)
            tmp.write(content)

        yield tmp_path

    finally:
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except Exception as e:
                # Cleanup is best-effort: never mask the caller's exception.
                logger.warning(f"Failed to cleanup temp file {tmp_path}: {e}")
71
+
72
+
73
@contextmanager
def temp_file_empty(
    suffix: str = "",
    prefix: str = "rem_",
    dir: Optional[str] = None,
) -> Generator[Path, None, None]:
    """
    Yield the path of a freshly created, empty temporary file.

    The file exists (zero bytes, handle already closed) for the duration
    of the ``with`` block and is deleted when the block exits, including
    when it exits through an exception.

    Args:
        suffix: File extension
        prefix: Prefix for the temp file name
        dir: Directory for temp file

    Yields:
        Path to the empty temporary file
    """
    tmp_path: Optional[Path] = None
    try:
        handle = tempfile.NamedTemporaryFile(
            suffix=suffix, prefix=prefix, dir=dir, delete=False
        )
        with handle:
            # Nothing is written; the handle is only opened to reserve a
            # unique name and is closed as soon as the name is captured.
            tmp_path = Path(handle.name)

        yield tmp_path
    finally:
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except Exception as e:
                logger.warning(f"Failed to cleanup temp file {tmp_path}: {e}")
111
+
112
+
113
@contextmanager
def temp_directory(
    prefix: str = "rem_",
    dir: Optional[str] = None,
) -> Generator[Path, None, None]:
    """
    Create a temporary directory, yield path, cleanup automatically.

    Removal is best-effort: the directory tree is deleted with
    ``ignore_errors=True`` so a partially removable tree is still cleared
    as far as possible. If the directory is still present afterwards a
    warning is logged; cleanup problems never raise.

    Args:
        prefix: Prefix for the temp directory name
        dir: Parent directory for temp directory

    Yields:
        Path to the temporary directory
    """
    import shutil

    tmp_dir: Optional[Path] = None
    try:
        tmp_dir = Path(tempfile.mkdtemp(prefix=prefix, dir=dir))
        yield tmp_dir

    finally:
        if tmp_dir is not None:
            try:
                shutil.rmtree(tmp_dir, ignore_errors=True)
                # ignore_errors=True suppresses removal failures, so the
                # except branch alone would never report an incomplete
                # cleanup; check the outcome explicitly.
                leftover = tmp_dir.exists()
            except Exception as e:
                logger.warning(f"Failed to cleanup temp directory {tmp_dir}: {e}")
            else:
                if leftover:
                    logger.warning(
                        f"Failed to cleanup temp directory {tmp_dir}: "
                        "directory still exists after removal"
                    )
141
+
142
+
143
def ensure_parent_exists(path: Path) -> Path:
    """
    Create the directory that will contain ``path`` if it is missing.

    Intermediate directories are created as needed and an already
    existing directory is not an error. ``path`` itself is not created.

    Args:
        path: File path whose parent should exist

    Returns:
        The same ``path`` object that was passed in (for chaining)
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    return path
155
+
156
+
157
def safe_delete(path: Path) -> bool:
    """
    Delete a file without raising, reporting whether it is gone.

    A path that does not exist counts as success. Any exception raised
    by the removal is logged as a warning and reported as failure.

    Args:
        path: Path to delete

    Returns:
        True if deleted or didn't exist, False on error
    """
    try:
        path.unlink(missing_ok=True)
    except Exception as e:
        logger.warning(f"Failed to delete {path}: {e}")
        return False
    return True
173
+
174
+
175
def _read_tsv(source, **kwargs):
    """Read tab-separated data with ``pl.read_csv``.

    The tab separator is only a default: a ``separator`` passed by the
    caller is used as given instead of colliding with a hard-coded one
    (which raised ``TypeError`` for a duplicate keyword argument). This
    mirrors how the TSV writer path applies its separator.
    """
    kwargs.setdefault("separator", "\t")
    return pl.read_csv(source, **kwargs)


# Extension to Polars reader mapping.
# Each value is called as reader(source, **kwargs).
_EXTENSION_READERS = {
    ".csv": pl.read_csv,
    ".tsv": _read_tsv,
    ".parquet": pl.read_parquet,
    ".pq": pl.read_parquet,
    ".json": pl.read_json,
    ".jsonl": pl.read_ndjson,
    ".ndjson": pl.read_ndjson,
    ".avro": pl.read_avro,
    ".xlsx": pl.read_excel,
    ".xls": pl.read_excel,
    ".ods": pl.read_ods,
    ".ipc": pl.read_ipc,
    ".arrow": pl.read_ipc,
    ".feather": pl.read_ipc,
}

# Extension to Polars writer mapping.
# Values are DataFrame method NAMES, resolved with getattr() at write time.
# There is no writer entry for ".xls" or ".ods", which are read-only here.
_EXTENSION_WRITERS = {
    ".csv": "write_csv",
    ".tsv": "write_csv",  # with separator="\t"
    ".parquet": "write_parquet",
    ".pq": "write_parquet",
    ".json": "write_json",
    ".jsonl": "write_ndjson",
    ".ndjson": "write_ndjson",
    ".avro": "write_avro",
    ".xlsx": "write_excel",
    ".ipc": "write_ipc",
    ".arrow": "write_ipc",
    ".feather": "write_ipc",
}
208
+
209
+
210
def read_dataframe(
    source: Union[str, Path, bytes, bytearray, memoryview],
    filename: Optional[str] = None,
    **kwargs,
) -> pl.DataFrame:
    """
    Read a DataFrame from a file, inferring format from extension.

    Supports all Polars-compatible formats:
    - CSV (.csv), TSV (.tsv)
    - Parquet (.parquet, .pq)
    - JSON (.json), JSONL/NDJSON (.jsonl, .ndjson)
    - Avro (.avro)
    - Excel (.xlsx, .xls)
    - OpenDocument (.ods)
    - Arrow IPC (.ipc, .arrow, .feather)

    Args:
        source: File path (str/Path) or in-memory content. Any of
            ``bytes``, ``bytearray`` or ``memoryview`` is treated as
            content and wrapped in a ``BytesIO``.
        filename: Required when source is in-memory content, to determine
            the format from its extension
        **kwargs: Additional arguments passed to the Polars reader

    Returns:
        Polars DataFrame

    Raises:
        ValueError: If source is in-memory content and no filename is
            given, or if the extension is not a supported format
        Exception: Whatever the selected Polars reader raises; it is
            logged and re-raised unchanged

    Examples:
        >>> df = read_dataframe("data.csv")
        >>> df = read_dataframe("data.parquet")
        >>> df = read_dataframe(csv_bytes, filename="data.csv")
    """
    # Determine the file extension
    if isinstance(source, (bytes, bytearray, memoryview)):
        # bytearray/memoryview used to fall through to Path(source) and
        # fail with TypeError; they are content, not paths.
        if not filename:
            raise ValueError("filename is required when source is bytes")
        ext = Path(filename).suffix.lower()
        # In-memory content is handed to the reader as a file-like object
        file_like = BytesIO(source)
    else:
        path = Path(source)
        ext = path.suffix.lower()
        file_like = path

    # Get the appropriate reader
    reader = _EXTENSION_READERS.get(ext)
    if reader is None:
        supported = ", ".join(sorted(_EXTENSION_READERS.keys()))
        raise ValueError(
            f"Unsupported file format: {ext}. "
            f"Supported formats: {supported}"
        )

    try:
        return reader(file_like, **kwargs)
    except Exception as e:
        logger.error(f"Failed to read DataFrame from {ext} format: {e}")
        raise
269
+
270
+
271
def write_dataframe(
    df: pl.DataFrame,
    dest: Union[str, Path],
    **kwargs,
) -> None:
    """
    Write a DataFrame to ``dest``, choosing the writer from its extension.

    Writable formats:
    - CSV (.csv), TSV (.tsv)
    - Parquet (.parquet, .pq)
    - JSON (.json), JSONL/NDJSON (.jsonl, .ndjson)
    - Avro (.avro)
    - Excel (.xlsx)
    - Arrow IPC (.ipc, .arrow, .feather)

    The destination's parent directory is created if it does not exist.
    For ``.tsv`` a tab separator is applied unless the caller passes
    its own ``separator``.

    Args:
        df: Polars DataFrame to write
        dest: Destination file path
        **kwargs: Additional arguments passed to the Polars writer

    Raises:
        ValueError: If the extension is not a supported writable format
        Exception: Whatever the selected writer raises; it is logged and
            re-raised unchanged

    Examples:
        >>> write_dataframe(df, "output.csv")
        >>> write_dataframe(df, "output.parquet")
        >>> write_dataframe(df, "output.jsonl")
    """
    path = Path(dest)
    ext = path.suffix.lower()

    # Guard clause: reject unknown extensions before touching the filesystem
    if ext not in _EXTENSION_WRITERS:
        supported = ", ".join(sorted(_EXTENSION_WRITERS))
        raise ValueError(
            f"Unsupported file format for writing: {ext}. "
            f"Supported formats: {supported}"
        )
    method_name = _EXTENSION_WRITERS[ext]

    ensure_parent_exists(path)

    # TSV shares write_csv; only the default separator differs
    if ext == ".tsv" and "separator" not in kwargs:
        kwargs["separator"] = "\t"

    try:
        # Attribute lookup stays inside the try so a missing writer method
        # is logged the same way as a failed write.
        getattr(df, method_name)(path, **kwargs)
    except Exception as e:
        logger.error(f"Failed to write DataFrame to {ext} format: {e}")
        raise
rem/utils/markdown.py ADDED
@@ -0,0 +1,16 @@
1
+ """Markdown conversion utilities for document processing."""
2
+
3
+
4
def to_markdown(content: str, filename: str) -> str:
    """
    Wrap extracted text in a minimal markdown document.

    The result is a level-1 heading holding the filename, one blank
    line, then ``content`` unchanged.

    Args:
        content: Extracted text content
        filename: Source filename

    Returns:
        Structured markdown string with header
    """
    heading = f"# {filename}\n"
    return heading + "\n" + content
rem/utils/mime_types.py ADDED
@@ -0,0 +1,158 @@
1
+ """
2
+ Centralized MIME type mappings for file format detection.
3
+
4
+ Provides bidirectional mappings between file extensions and MIME types.
5
+ Use these constants throughout the codebase instead of inline dictionaries.
6
+ """
7
+
8
# Extension to MIME type mapping (extension includes leading dot, lowercase)
EXTENSION_TO_MIME: dict[str, str] = {
    # Images
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".webp": "image/webp",
    ".bmp": "image/bmp",
    ".tiff": "image/tiff",
    ".svg": "image/svg+xml",
    # Documents
    ".pdf": "application/pdf",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".doc": "application/msword",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ".ppt": "application/vnd.ms-powerpoint",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".xls": "application/vnd.ms-excel",
    # Audio
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".m4a": "audio/x-m4a",
    ".flac": "audio/flac",
    ".ogg": "audio/ogg",
    ".aac": "audio/aac",
    # Video
    ".mp4": "video/mp4",
    ".webm": "video/webm",
    ".avi": "video/x-msvideo",
    ".mov": "video/quicktime",
    # Text/Code
    ".txt": "text/plain",
    ".md": "text/markdown",
    ".markdown": "text/markdown",
    ".json": "application/json",
    ".yaml": "application/x-yaml",
    ".yml": "application/x-yaml",
    ".xml": "application/xml",
    ".html": "text/html",
    ".css": "text/css",
    ".js": "application/javascript",
    ".py": "text/x-python",
    ".ts": "application/typescript",
    ".csv": "text/csv",
}

# MIME type to extension mapping. Each MIME type maps to ONE extension from
# the table above (e.g. ".jpg" rather than ".jpeg", ".yaml" rather than
# ".yml"). Besides the reverse of every forward entry, it carries aliases
# that have no forward entry, so the same format resolves under either name.
MIME_TO_EXTENSION: dict[str, str] = {
    # Images
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "image/gif": ".gif",
    "image/webp": ".webp",
    "image/bmp": ".bmp",
    "image/tiff": ".tiff",
    "image/svg+xml": ".svg",
    # Documents
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/msword": ".doc",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
    "application/vnd.ms-powerpoint": ".ppt",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/vnd.ms-excel": ".xls",
    # Audio
    "audio/wav": ".wav",
    "audio/x-wav": ".wav",  # alias
    "audio/mpeg": ".mp3",
    "audio/x-m4a": ".m4a",
    "audio/mp4": ".m4a",  # alias
    "audio/flac": ".flac",
    "audio/ogg": ".ogg",
    "audio/aac": ".aac",
    # Video
    "video/mp4": ".mp4",
    "video/webm": ".webm",
    "video/x-msvideo": ".avi",
    "video/quicktime": ".mov",
    # Text/Code
    "text/plain": ".txt",
    "text/markdown": ".md",
    "application/json": ".json",
    "application/x-yaml": ".yaml",
    "application/yaml": ".yaml",  # alias (registered type, RFC 9512)
    "application/xml": ".xml",
    "text/xml": ".xml",  # alias
    "text/html": ".html",
    "text/css": ".css",
    "application/javascript": ".js",
    "text/javascript": ".js",  # alias (current standard type, RFC 9239)
    "text/x-python": ".py",
    "application/typescript": ".ts",
    "text/csv": ".csv",
}

# Grouped by category for convenience. Together these sets contain exactly
# the keys of EXTENSION_TO_MIME.
IMAGE_EXTENSIONS: set[str] = {
    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".svg",
}
DOCUMENT_EXTENSIONS: set[str] = {
    ".pdf", ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls",
}
AUDIO_EXTENSIONS: set[str] = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".aac"}
VIDEO_EXTENSIONS: set[str] = {".mp4", ".webm", ".avi", ".mov"}
TEXT_EXTENSIONS: set[str] = {
    ".txt", ".md", ".markdown", ".json", ".yaml", ".yml", ".xml",
    ".html", ".css", ".js", ".py", ".ts", ".csv",
}
106
+
107
+
108
def get_extension(mime_type: str, default: str = ".bin") -> str:
    """
    Get file extension for a MIME type.

    The value is looked up exactly as given first. If that misses, any
    parameters after ``;`` are dropped and the remainder is stripped and
    lowercased before a second lookup, so ``"text/html; charset=utf-8"``
    and ``"IMAGE/PNG"`` resolve like their bare lowercase forms.

    Args:
        mime_type: MIME type string (e.g., "image/png")
        default: Default extension if MIME type not found

    Returns:
        File extension with leading dot (e.g., ".png"), or ``default``
    """
    # Exact match first: inputs that already resolved keep the same result.
    exact = MIME_TO_EXTENSION.get(mime_type)
    if exact is not None:
        return exact

    # Fall back to the bare type/subtype, without parameters or case noise.
    essence = mime_type.split(";", 1)[0].strip().lower()
    return MIME_TO_EXTENSION.get(essence, default)
120
+
121
+
122
def get_mime_type(extension: str, default: str = "application/octet-stream") -> str:
    """
    Look up the MIME type registered for a file extension.

    The extension is matched case-insensitively and the leading dot is
    optional: "PNG", "png" and ".png" all resolve the same way.

    Args:
        extension: File extension with or without leading dot
        default: Default MIME type if extension not found

    Returns:
        MIME type string (e.g., "image/png")
    """
    # Table keys carry a leading dot and are lowercase
    if not extension.startswith("."):
        extension = "." + extension
    lookup_key = extension.lower()
    return EXTENSION_TO_MIME.get(lookup_key, default)
136
+
137
+
138
def is_image(extension_or_mime: str) -> bool:
    """
    Check if extension or MIME type represents an image.

    A value starting with "." is treated as an extension and checked
    against IMAGE_EXTENSIONS; anything else is treated as a MIME type and
    matches when it starts with "image/". Both forms are compared
    case-insensitively.

    Args:
        extension_or_mime: Extension with leading dot (e.g., ".png") or
            MIME type (e.g., "image/png")

    Returns:
        True if the value denotes an image, False otherwise
    """
    # Lowercase once so the MIME branch is as case-insensitive as the
    # extension branch ("IMAGE/PNG" used to be rejected).
    value = extension_or_mime.lower()
    if value.startswith("."):
        return value in IMAGE_EXTENSIONS
    return value.startswith("image/")
143
+
144
+
145
def is_audio(extension_or_mime: str) -> bool:
    """
    Check if extension or MIME type represents audio.

    A value starting with "." is treated as an extension and checked
    against AUDIO_EXTENSIONS; anything else is treated as a MIME type and
    matches when it starts with "audio/". Both forms are compared
    case-insensitively.

    Args:
        extension_or_mime: Extension with leading dot (e.g., ".mp3") or
            MIME type (e.g., "audio/mpeg")

    Returns:
        True if the value denotes audio, False otherwise
    """
    # Lowercase once so the MIME branch is as case-insensitive as the
    # extension branch ("AUDIO/MPEG" used to be rejected).
    value = extension_or_mime.lower()
    if value.startswith("."):
        return value in AUDIO_EXTENSIONS
    return value.startswith("audio/")
150
+
151
+
152
def is_document(extension_or_mime: str) -> bool:
    """
    Check if extension or MIME type represents a document.

    A value starting with "." is treated as an extension and checked
    against DOCUMENT_EXTENSIONS. Anything else is treated as a MIME type
    and matches when it is PDF, a legacy Word/Excel/PowerPoint type, or
    contains "officedocument" (the OOXML family). Both forms are compared
    case-insensitively.

    Args:
        extension_or_mime: Extension with leading dot (e.g., ".pdf") or
            MIME type (e.g., "application/pdf")

    Returns:
        True if the value denotes a document, False otherwise
    """
    value = extension_or_mime.lower()
    if value.startswith("."):
        return value in DOCUMENT_EXTENSIONS
    # Explicit MIME types for the non-OOXML formats. The legacy Excel and
    # PowerPoint types are listed so that the MIME form of ".xls"/".ppt"
    # agrees with the extension form, which already counts as a document.
    doc_mimes = {
        "application/pdf",
        "application/msword",
        "application/vnd.ms-excel",
        "application/vnd.ms-powerpoint",
    }
    return value in doc_mimes or "officedocument" in value