longparser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,480 @@
1
+ """ARQ worker tasks for LongParser — extraction and embedding.
2
+
3
+ All tasks are idempotent: upserts by (tenant_id, job_id, block_id / chunk_id).
4
+ Workers check job.status == 'cancelled' between steps.
5
+
6
+ Start with:
7
+ uv run arq longparser.server.worker.WorkerSettings
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import logging
14
+ import uuid
15
+ from pathlib import Path
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def _env_flag(name: str, default: str = "true") -> bool:
    """Return True when env var *name* (or *default*) is "true", "1" or "yes"."""
    import os

    return os.getenv(name, default).lower() in ("true", "1", "yes")


def _text_hash(text: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *text*."""
    return hashlib.sha256(text.encode()).hexdigest()[:16]


async def _job_cancelled(db, tenant_id: str, job_id: str) -> bool:
    """Re-read the job and report whether its status is now ``cancelled``."""
    job = await db.get_job(tenant_id, job_id)
    return bool(job) and job["status"] == "cancelled"


async def extract_job(ctx: dict, tenant_id: str, job_id: str, file_path: str) -> dict:
    """Extract blocks and chunks from a document and store them.

    Steps:
        1. Skip when the job is missing or already in a terminal/reviewable state
        2. Mark the job ``extracting`` and run the LongParser pipeline
        3. Upsert every block (with its page number and a text hash)
        4. Chunk the blocks and upsert every chunk (with a text hash)
        5. Mark the job ``ready_for_review`` with totals and progress counters

    The job status is re-read after the pipeline run and again just before
    the final status write, so a cancellation requested while the worker is
    busy is not overwritten by ``ready_for_review``.

    Args:
        ctx: ARQ job context (not used by this task).
        tenant_id: Tenant that owns the job.
        job_id: Identifier of the job to process.
        file_path: Path of the document handed to the pipeline.

    Returns:
        ``{"error": ...}`` when the job is missing or an exception occurred,
        ``{"status": ...}`` when the job was skipped or cancelled, otherwise
        ``{"status": "ready_for_review", "blocks": int, "chunks": int}``.
    """
    from .db import Database
    from ..pipeline import PipelineOrchestrator
    from ..schemas import ProcessingConfig, ChunkingConfig

    db = Database()

    try:
        # Check if job is still valid
        job = await db.get_job(tenant_id, job_id)
        if not job:
            logger.error(f"Job {job_id} not found")
            return {"error": "job_not_found"}
        if job["status"] in ("cancelled", "ready_for_review", "finalized", "indexed"):
            logger.info(f"Job {job_id} already {job['status']}, skipping extraction")
            return {"status": job["status"]}

        await db.update_job(tenant_id, job_id, {"status": "extracting"})

        # Run pipeline; both OCR switches default to enabled.
        do_ocr = _env_flag("LONGPARSER_DO_OCR")
        formula_ocr = _env_flag("LONGPARSER_FORMULA_OCR")
        logger.info(
            f"[Worker] Extracting {file_path} for job {job_id} "
            f"(OCR={'on' if do_ocr else 'off'}, formula={'on' if formula_ocr else 'off'})"
        )
        pipeline = PipelineOrchestrator()
        # NOTE(review): process_file is called synchronously inside this
        # coroutine; confirm whether it should be offloaded to a thread.
        result = pipeline.process_file(
            Path(file_path),
            config=ProcessingConfig(do_ocr=do_ocr, formula_ocr=formula_ocr),
        )
        pages = result.document.pages

        if await _job_cancelled(db, tenant_id, job_id):
            logger.info(f"Job {job_id} cancelled during extraction")
            return {"status": "cancelled"}

        # Upsert blocks into MongoDB (confidence included internally, excluded from API)
        all_blocks = []
        for page in pages:
            for block in page.blocks:
                block_doc = block.model_dump(mode="json")
                block_doc["page_number"] = page.page_number
                # Text hash for change detection
                block_doc["text_hash"] = _text_hash(block.text)
                await db.upsert_block(tenant_id, job_id, block_doc)
                all_blocks.append(block)
        block_count = len(all_blocks)

        # Chunk
        logger.info(f"[Worker] Chunking {block_count} blocks for job {job_id}")
        from ..chunkers import HybridChunker
        chunks = HybridChunker(ChunkingConfig()).chunk(all_blocks)

        # Upsert chunks
        chunk_count = 0
        for chunk in chunks:
            chunk_doc = chunk.model_dump(mode="json")
            chunk_doc["text_hash"] = _text_hash(chunk.text)
            await db.upsert_chunk(tenant_id, job_id, chunk_doc)
            chunk_count += 1

        # A cancel may have arrived while blocks/chunks were being written;
        # do not overwrite it with ready_for_review.
        if await _job_cancelled(db, tenant_id, job_id):
            logger.info(f"Job {job_id} cancelled during extraction")
            return {"status": "cancelled"}

        # Update job
        page_count = len(pages)
        await db.update_job(tenant_id, job_id, {
            "status": "ready_for_review",
            "total_pages": page_count,
            "total_blocks": block_count,
            "total_chunks": chunk_count,
            "progress": {
                "pages_done": page_count,
                "blocks_saved": block_count,
                "chunks_saved": chunk_count,
                "embeddings_done": 0,
            },
        })

        logger.info(
            f"[Worker] Job {job_id} done: {block_count} blocks, {chunk_count} chunks"
        )
        return {"status": "ready_for_review", "blocks": block_count, "chunks": chunk_count}

    except Exception as e:
        # Task boundary: record the failure on the job and report it in the result.
        logger.exception(f"[Worker] Job {job_id} failed: {e}")
        await db.update_job(tenant_id, job_id, {
            "status": "failed",
            "error": str(e),
        })
        return {"error": str(e)}
    finally:
        await db.close()
123
+
124
+
125
async def embed_job(
    ctx: dict, tenant_id: str, job_id: str,
    model: str, vector_db: str, collection_name: str, index_version: str,
    provider: str = "huggingface",
) -> dict:
    """Embed approved chunks and store them in the chosen vector DB.

    Steps:
        1. Load approved chunks from Mongo (edited text wins over original)
        2. Embed them with ``EmbeddingEngine(provider, model)``
        3. Record the index version, then add the vectors to the store
        4. Mark the index version and the job ``indexed``

    The job status is re-read before and after the embedding step so a
    cancellation requested meanwhile stops the task before anything is
    written to the vector store.

    Args:
        ctx: ARQ job context (not used by this task).
        tenant_id: Tenant that owns the job.
        job_id: Identifier of the job whose chunks are embedded.
        model: Embedding model name passed to ``EmbeddingEngine``.
        vector_db: Vector store backend name passed to ``get_vector_store``.
        collection_name: Target collection in the vector store.
        index_version: Version label; also the last component of each vector ID.
        provider: Embedding provider passed to ``EmbeddingEngine``.

    Returns:
        ``{"error": ...}`` when the job is missing or an exception occurred,
        ``{"status": "cancelled"}`` when the job was cancelled, otherwise
        ``{"status": "indexed", "embedded": int}``.
    """
    import os

    from .db import Database
    from .embeddings import EmbeddingEngine
    from .vectorstores import get_vector_store

    db = Database()

    try:
        job = await db.get_job(tenant_id, job_id)
        if not job:
            # Same result shape as extract_job uses for a missing job.
            logger.error(f"Job {job_id} not found")
            return {"error": "job_not_found"}
        if job["status"] == "cancelled":
            return {"status": "cancelled"}

        await db.update_job(tenant_id, job_id, {"status": "embedding"})

        # Get approved chunks; nothing to embed still counts as indexed.
        chunks = await db.get_approved_chunks(tenant_id, job_id)
        if not chunks:
            await db.update_job(tenant_id, job_id, {"status": "indexed"})
            return {"status": "indexed", "embedded": 0}

        # Check cancellation
        job = await db.get_job(tenant_id, job_id)
        if job and job["status"] == "cancelled":
            return {"status": "cancelled"}

        # Grab explicit dimension override if any
        env_dim = os.getenv("LONGPARSER_EMBED_DIMENSIONS")
        configured_dimensions = int(env_dim) if env_dim else None

        # Embed
        logger.info(f"[Worker] Embedding {len(chunks)} chunks with {provider}/{model}")
        engine = EmbeddingEngine(
            provider=provider,
            model_name=model,
            dimensions=configured_dimensions,
        )
        texts = [c.get("edited_text") or c["text"] for c in chunks]
        embeddings = engine.embed_chunks(texts)
        # len() rather than truthiness: array-like results have no unambiguous bool.
        dim = len(embeddings[0]) if len(embeddings) else 0

        # Embedding is the long step; honour a cancel that arrived meanwhile
        # before anything is written to the index.
        job = await db.get_job(tenant_id, job_id)
        if job and job["status"] == "cancelled":
            return {"status": "cancelled"}

        fingerprint = engine.get_fingerprint()

        # Record index version
        await db.create_index_version(tenant_id, job_id, index_version, {
            "provider": provider,
            "model": model,
            "configured_dimensions": configured_dimensions,
            "dim": dim,
            "normalize": True,
            "distance_metric": "cosine",
            "vector_db": vector_db,
            "collection": collection_name,
            "fingerprint": fingerprint,
        })

        # Store in vector DB
        store = get_vector_store(
            vector_db,
            collection_name=collection_name,
            index_fingerprint=fingerprint,
        )

        # Deterministic vector IDs: {tenant_id}:{job_id}:{chunk_id}:{index_version}
        ids = [
            f"{tenant_id}:{job_id}:{c['chunk_id']}:{index_version}"
            for c in chunks
        ]
        metadatas = [
            {
                "tenant_id": tenant_id,
                "job_id": job_id,
                "chunk_id": c["chunk_id"],
                "chunk_type": c.get("chunk_type", ""),
                "section_path": c.get("section_path", []),
                "page_numbers": c.get("page_numbers", []),
                "block_ids": c.get("block_ids", []),
                "index_version": index_version,
                "text_hash": c.get("text_hash", ""),
            }
            for c in chunks
        ]

        store.add(ids=ids, embeddings=embeddings, metadatas=metadatas, documents=texts)

        # Update index version + job
        await db.index_versions.update_one(
            {"tenant_id": tenant_id, "job_id": job_id, "index_version": index_version},
            {"$set": {"status": "indexed"}},
        )
        await db.update_job(tenant_id, job_id, {
            "status": "indexed",
            "progress.embeddings_done": len(chunks),
        })

        logger.info(f"[Worker] Job {job_id} indexed: {len(chunks)} vectors in {vector_db}")
        return {"status": "indexed", "embedded": len(chunks)}

    except Exception as e:
        # Task boundary: record the failure on the job and report it in the result.
        logger.exception(f"[Worker] Embed job {job_id} failed: {e}")
        await db.update_job(tenant_id, job_id, {
            "status": "failed",
            "error": str(e),
        })
        return {"error": str(e)}
    finally:
        await db.close()
245
+
246
+
247
+ # ---------------------------------------------------------------------------
248
+ # Chat Background Tasks
249
+ # ---------------------------------------------------------------------------
250
+
251
async def summarize_session(ctx: dict, tenant_id: str, session_id: str) -> dict:
    """Compress older turns into a rolling summary (mid-term memory).

    Steps:
        1. Get session + unarchived turns
        2. Keep the last ``ChatConfig.short_term_turns`` turns; summarize the rest
        3. Update rolling_summary, passing the session version read in step 1
        4. Archive the summarized turns

    Args:
        ctx: ARQ job context (not used by this task).
        tenant_id: Tenant that owns the session.
        session_id: Chat session to summarize.

    Returns:
        ``{"error": ...}`` when the session is missing or an exception
        occurred, ``{"status": "skipped", "reason": ...}`` when there is
        nothing to do, ``{"status": "conflict"}`` when the summary update was
        rejected, otherwise ``{"status": "summarized", "archived": ...}``.
    """
    from .db import Database
    from .schemas import ChatConfig
    from .llm_chain import get_plain_chat_model
    from langchain_core.messages import SystemMessage, HumanMessage

    # Build config and model before creating the Database: db.close() only
    # runs once the try block is entered, so anything that can raise between
    # Database() and try would leave the handle unclosed.
    config = ChatConfig()
    llm = get_plain_chat_model(config=config)
    db = Database()

    try:
        session = await db.get_chat_session(tenant_id, session_id)
        if not session:
            return {"error": "session_not_found"}

        turns = await db.get_unarchived_turns(tenant_id, session_id)
        keep = config.short_term_turns
        if len(turns) <= keep:
            return {"status": "skipped", "reason": "not enough turns"}

        # Keep last N as short-term, summarize the rest. An explicit end index
        # is used because turns[:-0] is an empty slice, which would make a
        # keep-count of 0 summarize nothing instead of everything.
        to_summarize = turns[:len(turns) - keep]
        if not to_summarize:
            return {"status": "skipped", "reason": "nothing to summarize"}

        # Build summarization prompt
        existing_summary = session.get("rolling_summary", "")
        turn_text = "\n".join(
            f"User: {t['question']}\nAssistant: {t['answer']}"
            for t in to_summarize
        )
        messages = [
            SystemMessage(content="You are a conversation summarizer. Produce a concise summary that preserves key facts, decisions, and context. Return plain text, no JSON."),
            HumanMessage(content=f"Existing summary:\n{existing_summary}\n\nNew turns to incorporate:\n{turn_text}\n\nProduce an updated summary:"),
        ]

        response = await llm.ainvoke(messages)
        new_summary = response.content

        # Update with optimistic lock
        updated = await db.update_rolling_summary(
            tenant_id, session_id, new_summary, session["version"]
        )
        if not updated:
            logger.warning(f"[Worker] Summary version conflict for session {session_id}")
            return {"status": "conflict"}

        # Archive summarized turns
        turn_ids = [t["turn_id"] for t in to_summarize]
        archived = await db.archive_turns(tenant_id, session_id, turn_ids)

        logger.info(f"[Worker] Summarized session {session_id}: {archived} turns archived")
        return {"status": "summarized", "archived": archived}

    except Exception as e:
        # Task boundary: log with traceback and report the error in the result.
        logger.exception(f"[Worker] Summarize session {session_id} failed: {e}")
        return {"error": str(e)}
    finally:
        await db.close()
317
+
318
+
319
async def extract_facts(
    ctx: dict, tenant_id: str, session_id: str, job_id: str
) -> dict:
    """Extract long-term facts from recent conversation (Layer 3 memory).

    Asks the chat model for a JSON list of facts drawn from the 20 most
    recent turns, keeps only well-formed entries whose ``type`` is
    allowlisted and whose ``source`` is ``"doc"`` or ``"user"``, appends them
    to the session's existing facts and trims the result to the newest
    ``ChatConfig.max_facts`` entries.

    Every stored fact is written with an empty ``supporting_chunk_ids`` list;
    this task does not attach chunk provenance.

    Args:
        ctx: ARQ job context (not used by this task).
        tenant_id: Tenant that owns the session.
        session_id: Chat session to read turns from and write facts to.
        job_id: Accepted for the task signature; not used in the body.

    Returns:
        ``{"error": ...}`` when the session is missing, the model reply is
        not the expected JSON object, or an exception occurred;
        ``{"status": "skipped", ...}`` when there are no turns;
        ``{"status": "conflict"}`` when the facts update was rejected;
        otherwise ``{"status": "extracted", "new_facts": int, "total": int}``.
    """
    import json

    from .db import Database
    from .schemas import ChatConfig
    from .llm_chain import get_chat_model
    from langchain_core.messages import SystemMessage, HumanMessage

    ALLOWED_FACT_TYPES = {"entities_from_doc", "user_preferences", "decisions"}

    # Build config and model before creating the Database: db.close() only
    # runs once the try block is entered, so anything that can raise between
    # Database() and try would leave the handle unclosed.
    config = ChatConfig()
    llm = get_chat_model(config=config, json_mode=False)
    db = Database()

    try:
        session = await db.get_chat_session(tenant_id, session_id)
        if not session:
            return {"error": "session_not_found"}

        turns = await db.get_recent_turns(tenant_id, session_id, n=20)
        if not turns:
            return {"status": "skipped", "reason": "no turns"}

        turn_text = "\n".join(
            f"User: {t['question']}\nAssistant: {t['answer']}"
            for t in turns
        )
        messages = [
            SystemMessage(content=(
                "Extract key facts from this conversation. Return JSON:\n"
                '{"facts": [{"type": "entities_from_doc"|"user_preferences"|"decisions", '
                '"source": "doc"|"user", "fact": "...", "confidence": 0.0-1.0}]}\n'
                "Only extract facts clearly stated in the conversation. "
                "Do NOT infer or guess. Maximum 10 facts."
            )),
            HumanMessage(content=turn_text),
        ]

        response = await llm.ainvoke(messages)
        raw = response.content

        try:
            data = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            # TypeError: the reply content was not str/bytes at all.
            return {"error": "invalid_json"}

        # Valid JSON of the wrong shape (a list, a string, "facts": 5, ...)
        # is rejected the same way as unparseable JSON.
        if not isinstance(data, dict):
            return {"error": "invalid_json"}
        candidates = data.get("facts") or []
        if not isinstance(candidates, list):
            return {"error": "invalid_json"}

        # Filter: only allowlisted types, only doc/user sources. Entries that
        # are not objects or carry no fact text are skipped so one bad entry
        # does not discard the rest.
        existing_facts = session.get("long_term_facts", [])
        new_facts = []
        for f in candidates:
            if not isinstance(f, dict):
                continue
            if f.get("type") not in ALLOWED_FACT_TYPES:
                continue
            if f.get("source") not in ("doc", "user"):
                continue
            if not f.get("fact"):
                continue
            new_facts.append({
                "type": f["type"],
                "source": f["source"],
                "fact": f["fact"],
                "supporting_chunk_ids": [],
                "confidence": f.get("confidence", 0.5),
            })

        # Merge + cap at max_facts, keeping the newest entries. The zero case
        # is explicit because merged[-0:] is the whole list, not an empty one.
        merged = existing_facts + new_facts
        max_facts = config.max_facts
        merged = merged[-max_facts:] if max_facts > 0 else []

        updated = await db.update_long_term_facts(
            tenant_id, session_id, merged, session["version"]
        )
        if not updated:
            logger.warning(f"[Worker] Facts version conflict for session {session_id}")
            return {"status": "conflict"}

        logger.info(f"[Worker] Extracted {len(new_facts)} facts for session {session_id}")
        return {"status": "extracted", "new_facts": len(new_facts), "total": len(merged)}

    except Exception as e:
        # Task boundary: log with traceback and report the error in the result.
        logger.exception(f"[Worker] Extract facts {session_id} failed: {e}")
        return {"error": str(e)}
    finally:
        await db.close()
405
+
406
+
407
async def purge_expired_sessions(ctx: dict) -> dict:
    """Scheduled task: hard-delete turns for soft-deleted sessions past TTL.

    For every session returned by ``get_expired_sessions(ttl_days)`` the
    session's turns are purged and then the session document itself is
    deleted.

    Returns:
        ``{"status": "purged", "sessions": int, "turns": int}`` on success,
        ``{"error": str}`` when any step raised.
    """
    from .db import Database
    from .schemas import ChatConfig

    db = Database()
    config = ChatConfig()

    try:
        stale_sessions = await db.get_expired_sessions(config.ttl_days)

        turns_removed = 0
        for stale in stale_sessions:
            removed = await db.purge_turns_for_session(
                stale["tenant_id"], stale["session_id"]
            )
            session_filter = {
                "tenant_id": stale["tenant_id"],
                "session_id": stale["session_id"],
            }
            await db.chat_sessions.delete_one(session_filter)
            turns_removed += removed

        session_total = len(stale_sessions)
        # Only log when at least one turn was removed.
        if turns_removed > 0:
            logger.info(f"[Worker] Purged {turns_removed} turns from {session_total} expired sessions")
        return {"status": "purged", "sessions": session_total, "turns": turns_removed}

    except Exception as e:
        logger.exception(f"[Worker] Purge expired sessions failed: {e}")
        return {"error": str(e)}
    finally:
        await db.close()
437
+
438
+
439
+ # ---------------------------------------------------------------------------
440
+ # ARQ Worker Settings
441
+ # ---------------------------------------------------------------------------
442
+
443
class WorkerSettings:
    """ARQ worker configuration — start with `arq longparser.server.worker.WorkerSettings`."""

    # Imported in the class body (as in the original layout), so `os` and
    # `RedisSettings` are also visible as class attributes.
    import os
    from arq.connections import RedisSettings

    # Task coroutines this worker can execute.
    functions = [
        extract_job,
        embed_job,
        summarize_session,
        extract_facts,
        purge_expired_sessions,
    ]

    # 10-min timeout (in seconds): ~72s Docling + up to 420s formula OCR + headroom.
    # The previous value of 420 covered the formula-OCR budget alone.
    job_timeout = 600

    # Redis connection; override with LONGPARSER_REDIS_URL.
    _redis_url = os.getenv("LONGPARSER_REDIS_URL", "redis://localhost:6379/0")
    redis_settings = RedisSettings.from_dsn(_redis_url)

    # Scheduled cron tasks
    cron_jobs = None  # set below after import

    @staticmethod
    async def on_startup(ctx: dict) -> None:
        """Log that the worker is starting."""
        logger.info("[ARQ Worker] Starting up")

    @staticmethod
    async def on_shutdown(ctx: dict) -> None:
        """Log that the worker is shutting down."""
        logger.info("[ARQ Worker] Shutting down")
471
+
472
+
473
+ # Cron: purge expired sessions once per hour
474
try:
    from arq import cron
except ImportError:
    # arq cron not available in all versions; leave cron_jobs unset.
    pass
else:
    # hour=None with minute=0 -> every hour at :00
    WorkerSettings.cron_jobs = [
        cron(purge_expired_sessions, hour=None, minute=0),
    ]
@@ -0,0 +1,5 @@
1
+ """Utility modules for LongParser."""
2
+
3
+ from .rtl_detector import detect_rtl_language
4
+
5
+ __all__ = ["detect_rtl_language"]
@@ -0,0 +1,93 @@
1
+ """RTL (Right-to-Left) language detection utility."""
2
+
3
+ import re
4
+ from typing import Optional
5
+
6
+
7
+ # Unicode ranges for RTL scripts
8
RTL_RANGES = [
    (0x0600, 0x06FF),  # Arabic
    (0x0750, 0x077F),  # Arabic Supplement
    (0x08A0, 0x08FF),  # Arabic Extended-A
    (0xFB50, 0xFDFF),  # Arabic Presentation Forms-A
    (0xFE70, 0xFEFC),  # Arabic Presentation Forms-B (stops before U+FEFF, the BOM)
    (0x0590, 0x05FF),  # Hebrew
    (0xFB1D, 0xFB4F),  # Hebrew presentation forms (U+FB00-FB17 are Latin/Armenian ligatures)
    (0x0700, 0x074F),  # Syriac
    (0x0780, 0x07BF),  # Thaana (Maldivian)
    (0x0840, 0x085F),  # Mandaic
]

# Single character class covering every range above, e.g. "[\u0600-\u06ff...]".
RTL_PATTERN = re.compile(
    "[" + "".join(f"\\u{start:04x}-\\u{end:04x}" for start, end in RTL_RANGES) + "]"
)


def detect_rtl_language(text: str, threshold: float = 0.1) -> bool:
    """
    Detect if text contains significant RTL content.

    The ratio is computed over alphabetic characters only, on both sides:
    RTL letters divided by all letters. Digits, punctuation, combining marks
    and whitespace are ignored, so the ratio always lies between 0 and 1.

    Args:
        text: Text to analyze
        threshold: Minimum ratio of RTL letters to all letters for the text
            to be reported as RTL

    Returns:
        True if the ratio of RTL letters reaches ``threshold``; False for
        empty text or text without any alphabetic character
    """
    if not text:
        return False

    # Alphabetic characters only (excludes spaces, numbers, punctuation)
    letters = "".join(ch for ch in text if ch.isalpha())
    if not letters:
        return False

    # Count RTL matches among the letters, so numerator and denominator are
    # measured over the same population.
    rtl_letters = len(RTL_PATTERN.findall(letters))
    return rtl_letters / len(letters) >= threshold
55
+
56
+
57
# Compiled once at import time instead of on every call.
# Arabic script, including Presentation Forms A/B (B stops before U+FEFF, the BOM).
_ARABIC_SCRIPT_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]'
)
# Hebrew block plus the Hebrew part of Alphabetic Presentation Forms.
_HEBREW_SCRIPT_RE = re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]')
# Characters used as the Urdu hint.
# NOTE(review): some of these (e.g. U+067E, U+0686, U+06AF, U+06CC) also occur
# in other Arabic-script languages such as Persian — confirm the intended set.
_URDU_HINT_RE = re.compile(
    r'[\u0679\u067E\u0686\u0688\u0691\u0698\u06A9\u06AF\u06BA\u06BE\u06C1\u06C3\u06CC\u06D2]'
)


def detect_rtl_script(text: str) -> Optional[str]:
    """
    Detect the specific RTL script in text.

    Arabic-script and Hebrew-script characters are counted, presentation
    forms included. Arabic wins only when it has strictly more characters
    than Hebrew; a tie with at least one Hebrew character yields 'hebrew'.

    Args:
        text: Text to analyze

    Returns:
        'urdu' when Arabic script dominates and an Urdu hint character is
        present, 'arabic' when Arabic script dominates otherwise, 'hebrew'
        when any Hebrew character is present and Arabic does not dominate,
        or None
    """
    if not text:
        return None

    # Arabic/Urdu detection (same script, different languages)
    arabic_count = len(_ARABIC_SCRIPT_RE.findall(text))

    # Hebrew detection
    hebrew_count = len(_HEBREW_SCRIPT_RE.findall(text))

    if arabic_count > hebrew_count and arabic_count > 0:
        if _URDU_HINT_RE.search(text):
            return 'urdu'
        return 'arabic'

    if hebrew_count > 0:
        return 'hebrew'

    return None
89
+
90
+
91
def get_rtl_languages() -> list[str]:
    """Return the supported RTL language codes as a fresh list."""
    codes = ('ar', 'he', 'ur', 'fa', 'ps', 'sd', 'yi', 'dv')
    return list(codes)