docintel-platform 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. docintel/__init__.py +6 -0
  2. docintel/app.py +45 -0
  3. docintel/auth/__init__.py +12 -0
  4. docintel/auth/api_keys.py +48 -0
  5. docintel/auth/limiter.py +41 -0
  6. docintel/auth/middleware.py +34 -0
  7. docintel/auth/oidc.py +45 -0
  8. docintel/cli.py +21 -0
  9. docintel/client.py +193 -0
  10. docintel/config.py +20 -0
  11. docintel/jobs/__init__.py +16 -0
  12. docintel/jobs/helpers.py +38 -0
  13. docintel/jobs/models.py +78 -0
  14. docintel/jobs/queue.py +75 -0
  15. docintel/jobs/store.py +82 -0
  16. docintel/jobs/tasks.py +173 -0
  17. docintel/jobs/webhooks.py +32 -0
  18. docintel/openapi/__init__.py +1 -0
  19. docintel/openapi/openapi.yaml +380 -0
  20. docintel/ops/__init__.py +1 -0
  21. docintel/ops/logging.py +40 -0
  22. docintel/ops/metrics.py +57 -0
  23. docintel/ops/middleware.py +40 -0
  24. docintel/routes/__init__.py +1 -0
  25. docintel/routes/jobs.py +26 -0
  26. docintel/routes/match.py +43 -0
  27. docintel/routes/openapi_docs.py +57 -0
  28. docintel/routes/ops.py +22 -0
  29. docintel/routes/pdf.py +420 -0
  30. docintel/routes/text.py +41 -0
  31. docintel/services/__init__.py +1 -0
  32. docintel/services/matching/__init__.py +6 -0
  33. docintel/services/matching/models.py +19 -0
  34. docintel/services/matching/scorer.py +64 -0
  35. docintel/services/pdf/__init__.py +26 -0
  36. docintel/services/pdf/annotator.py +188 -0
  37. docintel/services/pdf/models.py +104 -0
  38. docintel/services/pdf/ocr.py +130 -0
  39. docintel/services/pdf/pii.py +105 -0
  40. docintel/services/pdf/presets.py +26 -0
  41. docintel/services/pdf/search.py +29 -0
  42. docintel/services/pdf/sensitive.py +212 -0
  43. docintel/services/pdf/structure.py +118 -0
  44. docintel/services/pdf/structure_llm.py +136 -0
  45. docintel/services/pdf/structure_render.py +136 -0
  46. docintel/services/pdf/structure_schema.py +99 -0
  47. docintel/services/summary/__init__.py +6 -0
  48. docintel/services/summary/models.py +21 -0
  49. docintel/services/summary/textrank.py +57 -0
  50. docintel/ui.py +347 -0
  51. docintel/wsgi.py +5 -0
  52. docintel_platform-1.0.2.dist-info/METADATA +607 -0
  53. docintel_platform-1.0.2.dist-info/RECORD +56 -0
  54. docintel_platform-1.0.2.dist-info/WHEEL +5 -0
  55. docintel_platform-1.0.2.dist-info/entry_points.txt +3 -0
  56. docintel_platform-1.0.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,99 @@
1
+ """Structured document schema for LLM PDF curation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+
9
@dataclass
class TableBlock:
    """A table made of one header row and any number of body rows, all cells as strings."""

    headers: list[str]
    rows: list[list[str]]

    @classmethod
    def from_dict(cls, payload: dict[str, Any]) -> "TableBlock":
        """Build a table from a loosely-typed mapping.

        Args:
            payload: Mapping that may carry ``headers`` (a sequence of cells)
                and ``rows`` (a sequence of cell sequences).

        Returns:
            A ``TableBlock`` whose cells are all ``str``. A key that is
            missing or explicitly ``None`` yields an empty list, a ``None``
            row yields an empty row, and a ``None`` cell yields ``""``
            rather than the literal text ``"None"``.
        """

        def as_cell(value: Any) -> str:
            return "" if value is None else str(value)

        # ``dict.get(key, [])`` only covers a missing key; ``or []`` also
        # covers a key that is present but null.
        headers = payload.get("headers") or []
        rows = payload.get("rows") or []
        return cls(
            headers=[as_cell(item) for item in headers],
            rows=[[as_cell(cell) for cell in (row or [])] for row in rows],
        )
20
+
21
+
22
@dataclass
class SectionBlock:
    """One document section: a heading with its paragraphs, list items and tables."""

    heading: str
    level: int  # heading depth, kept within 1..6 by ``from_dict``
    paragraphs: list[str] = field(default_factory=list)
    list_items: list[str] = field(default_factory=list)
    tables: list[TableBlock] = field(default_factory=list)

    @classmethod
    def from_dict(cls, payload: dict[str, Any]) -> "SectionBlock":
        """Build a section from a loosely-typed mapping.

        Args:
            payload: Mapping that may carry ``heading``, ``level``,
                ``paragraphs``, ``list_items`` and ``tables``.

        Returns:
            A ``SectionBlock`` with stripped text and blank entries removed.
            Keys that are missing or ``None`` fall back to empty values, and
            a ``level`` that cannot be read as an integer falls back to 1
            before being clamped to the range 1..6.
        """

        def clean_strings(key: str) -> list[str]:
            # Strip each entry once; drop nulls and entries that end up blank.
            values = payload.get(key) or []
            stripped = (str(value).strip() for value in values if value is not None)
            return [text for text in stripped if text]

        try:
            level = int(payload.get("level") or 1)
        except (TypeError, ValueError, OverflowError):
            # e.g. "h2", a nested object, or an infinite float
            level = 1

        heading = payload.get("heading")
        return cls(
            heading="" if heading is None else str(heading).strip(),
            level=max(1, min(6, level)),
            paragraphs=clean_strings("paragraphs"),
            list_items=clean_strings("list_items"),
            tables=[TableBlock.from_dict(item) for item in (payload.get("tables") or [])],
        )
40
+
41
+
42
@dataclass
class StructuredPage:
    """Structured content of a single page."""

    page_index: int
    title: str
    sections: list[SectionBlock] = field(default_factory=list)
    plain_text: str = ""

    @classmethod
    def from_llm_payload(cls, page_index: int, payload: dict[str, Any]) -> "StructuredPage":
        """Build a page from a loosely-typed payload mapping.

        Args:
            page_index: Index stored on the resulting page unchanged.
            payload: Mapping that may carry ``page_title``, ``sections`` and
                ``plain_text``.

        Returns:
            A ``StructuredPage``. Keys that are missing or ``None`` are
            treated as empty instead of turning into the text ``"None"``.
            When ``plain_text`` is empty it is rebuilt from the sections.
        """

        def as_text(value: Any) -> str:
            return "" if value is None else str(value).strip()

        sections = [SectionBlock.from_dict(item) for item in (payload.get("sections") or [])]
        plain_text = as_text(payload.get("plain_text"))
        if not plain_text:
            plain_text = _sections_to_plain_text(sections)
        return cls(
            page_index=page_index,
            title=as_text(payload.get("page_title")),
            sections=sections,
            plain_text=plain_text,
        )
61
+
62
+
63
@dataclass
class StructuredDocument:
    """A titled, ordered collection of structured pages."""

    title: str
    pages: list[StructuredPage] = field(default_factory=list)

    @property
    def sections(self) -> list[SectionBlock]:
        """Return the sections of every page as one flat list, in page order."""
        return [section for page in self.pages for section in page.sections]

    @classmethod
    def from_pages(cls, pages: list[StructuredPage]) -> "StructuredDocument":
        """Create a document titled after the first page that has a non-empty title.

        Falls back to ``"Structured Document"`` when no page has a title.
        The ``pages`` list is stored as given, without copying.
        """
        first_title = next((page.title for page in pages if page.title), "")
        return cls(title=first_title or "Structured Document", pages=pages)
85
+
86
+
87
+ def _sections_to_plain_text(sections: list[SectionBlock]) -> str:
88
+ lines: list[str] = []
89
+ for section in sections:
90
+ if section.heading:
91
+ lines.append(section.heading)
92
+ lines.extend(section.paragraphs)
93
+ lines.extend(f"- {item}" for item in section.list_items)
94
+ for table in section.tables:
95
+ if table.headers:
96
+ lines.append(" | ".join(table.headers))
97
+ for row in table.rows:
98
+ lines.append(" | ".join(row))
99
+ return "\n".join(lines)
@@ -0,0 +1,6 @@
1
+ """Text summarization service."""
2
+
3
+ from docintel.services.summary.models import SummaryResult
4
+ from docintel.services.summary.textrank import summarize_text
5
+
6
+ __all__ = ["SummaryResult", "summarize_text"]
@@ -0,0 +1,21 @@
1
+ """Types for text summarization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+
8
@dataclass(frozen=True)
class SummaryResult:
    """Immutable record of a summary and the sentence counts behind it."""

    summary: str
    sentences: list[str]
    sentence_count: int
    source_sentence_count: int

    def to_dict(self) -> dict:
        """Return the four fields as a plain dict keyed by field name."""
        keys = ("summary", "sentences", "sentence_count", "source_sentence_count")
        return {key: getattr(self, key) for key in keys}
@@ -0,0 +1,57 @@
1
+ """TextRank-style extractive summarization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ import networkx as nx
8
+ import numpy as np
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+
12
+ from docintel.services.summary.models import SummaryResult
13
+
14
+ DEFAULT_SENTENCE_COUNT = 3
15
+ MAX_SENTENCE_COUNT = 20
16
+
17
+
18
def split_sentences(text: str) -> list[str]:
    """Break text into sentences at whitespace that follows ``.``, ``!`` or ``?``.

    Runs of whitespace are collapsed to single spaces first; blank input
    yields an empty list.
    """
    normalized = " ".join(text.split())
    if normalized == "":
        return []

    sentences: list[str] = []
    for chunk in re.split(r"(?<=[.!?])\s+", normalized):
        trimmed = chunk.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
26
+
27
+
28
def summarize_text(text: str, sentence_count: int = DEFAULT_SENTENCE_COUNT) -> SummaryResult:
    """Return an extractive summary using a TextRank graph over sentence similarity.

    Args:
        text: Source text; it is split with ``split_sentences``.
        sentence_count: Maximum number of sentences to keep, between 1 and
            ``MAX_SENTENCE_COUNT`` inclusive.

    Returns:
        A ``SummaryResult`` whose sentences appear in their original order.
        When the text has no more sentences than requested, all of them are
        returned without ranking.

    Raises:
        ValueError: If ``sentence_count`` is out of range or the text
            contains no sentences.
    """
    if sentence_count < 1 or sentence_count > MAX_SENTENCE_COUNT:
        raise ValueError(f"sentence_count must be between 1 and {MAX_SENTENCE_COUNT}.")

    sentences = split_sentences(text)
    if not sentences:
        raise ValueError("Text is required.")

    if len(sentences) <= sentence_count:
        selected = sentences
    else:
        try:
            matrix = TfidfVectorizer(stop_words="english").fit_transform(sentences)
        except ValueError:
            # scikit-learn raises ValueError ("empty vocabulary") when no
            # sentence contains a usable term, e.g. only stop words. There
            # is nothing to rank then, so keep the leading sentences rather
            # than surfacing an opaque vectorizer error to the caller.
            selected = sentences[:sentence_count]
        else:
            similarity = cosine_similarity(matrix)
            # Remove self-similarity so the graph carries no self-loops.
            np.fill_diagonal(similarity, 0.0)

            graph = nx.from_numpy_array(similarity)
            scores = nx.pagerank(graph, weight="weight")
            ranked_indices = sorted(scores.items(), key=lambda item: item[1], reverse=True)
            # Re-sort the winners by position so the summary reads in source order.
            top_indices = sorted(index for index, _ in ranked_indices[:sentence_count])
            selected = [sentences[index] for index in top_indices]

    return SummaryResult(
        summary=" ".join(selected),
        sentences=selected,
        sentence_count=len(selected),
        source_sentence_count=len(sentences),
    )
docintel/ui.py ADDED
@@ -0,0 +1,347 @@
1
+ """Gradio upload UI for the document intelligence platform."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import tempfile
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import requests
13
+
14
+ API_BASE = os.getenv("DOCINTEL_API_URL", "http://127.0.0.1:5000").rstrip("/")
15
+ API_KEY = os.getenv("DOCINTEL_API_KEY", "")
16
+
17
+
18
+ def _api_headers() -> dict[str, str]:
19
+ if API_KEY.strip():
20
+ return {"Authorization": f"Bearer {API_KEY.strip()}"}
21
+ return {}
22
+ GRADIO_HOST = os.getenv("GRADIO_SERVER_NAME", "127.0.0.1")
23
+ GRADIO_PORT = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
24
+
25
+
26
def resolve_upload_path(upload: Any) -> Path | None:
    """Turn an upload value into a ``Path``, or ``None`` when no path can be found.

    Accepts a string or ``Path``, a dict with a truthy ``"path"`` entry, or a
    non-empty list whose first element is one of those (nested lists are
    followed through their first element).
    """
    candidate = upload
    # Follow non-empty lists down to their first element.
    while isinstance(candidate, list) and candidate:
        candidate = candidate[0]

    if isinstance(candidate, (str, Path)):
        return Path(candidate)
    if isinstance(candidate, dict):
        location = candidate.get("path")
        return Path(location) if location else None
    # None, an empty list, or any unsupported type.
    return None
37
+
38
+
39
+ def _api_error(response: requests.Response) -> str:
40
+ try:
41
+ payload = response.json()
42
+ return payload.get("error", response.text)
43
+ except Exception:
44
+ return response.text or f"HTTP {response.status_code}"
45
+
46
+
47
def check_api_health() -> str:
    """Query ``{API_BASE}/health`` and describe the outcome in one line.

    Returns an "online" message with the reported version (``"unknown"``
    when absent), an "unhealthy" message for a non-OK response, or a
    "cannot reach" message when the request raises ``requests.RequestException``.
    """
    try:
        response = requests.get(f"{API_BASE}/health", timeout=10)
        if not response.ok:
            return f"API unhealthy: {_api_error(response)}"
        version = response.json().get("version", "unknown")
        return f"API online ({version}) at {API_BASE}"
    except requests.RequestException as exc:
        return f"Cannot reach API at {API_BASE}: {exc}"
56
+
57
+
58
def annotate_pdf_ui(pdf_file: Any, pattern: str, action: str) -> tuple[Any, str]:
    """Send a PDF and a search pattern to ``/v1/pdf/annotate`` and save the result.

    Args:
        pdf_file: Upload value, resolved with ``resolve_upload_path``.
        pattern: Search pattern forwarded to the API; must not be blank.
        action: Annotation action forwarded to the API.

    Returns:
        ``(path, status)``: the path of a temporary file holding the
        annotated PDF plus a status line, or ``(None, message)`` when input
        is missing, the request fails, or the API reports an error.
    """
    path = resolve_upload_path(pdf_file)
    if path is None:
        return None, "Upload a PDF file."
    if not pattern.strip():
        return None, "Enter a search pattern."

    try:
        with path.open("rb") as handle:
            response = requests.post(
                f"{API_BASE}/v1/pdf/annotate",
                files={"file": (path.name, handle, "application/pdf")},
                data={"pattern": pattern, "action": action},
                headers=_api_headers(),
                timeout=120,
            )
    # RequestException derives from OSError, so it has to be matched first.
    except requests.RequestException as exc:
        return None, f"API request to {API_BASE} failed: {exc}"
    except OSError as exc:
        return None, f"Could not read the uploaded file: {exc}"

    if not response.ok:
        return None, _api_error(response)

    # delete=False: the file must outlive this call so the caller can serve it.
    with tempfile.NamedTemporaryFile(delete=False, suffix="_annotated.pdf") as output:
        output.write(response.content)

    matches = response.headers.get("X-Docintel-Matches", "?")
    return output.name, f"Annotated PDF ready. Matches: {matches}"
81
+
82
+
83
def detect_sensitive_ui(
    pdf_file: Any,
    action: str,
    entities: str,
    force_ocr: bool,
    add_text_layer: bool,
) -> tuple[Any, str]:
    """Run ``/v1/pdf/detect-sensitive`` on a PDF and download the processed file.

    Args:
        pdf_file: Upload value, resolved with ``resolve_upload_path``.
        action: Annotation action forwarded to the API.
        entities: Entity names forwarded as-is; omitted from the request when blank.
        force_ocr: Sent as ``"true"``/``"false"``.
        add_text_layer: Sent as ``"true"``/``"false"``.

    Returns:
        ``(path, report)``: the path of a temporary file holding the
        processed PDF plus a JSON report (match counts and at most 20
        findings), or ``(None, message)`` on any failure.
    """
    path = resolve_upload_path(pdf_file)
    if path is None:
        return None, "Upload a PDF file."

    data = {
        "action": action,
        "force_ocr": str(force_ocr).lower(),
        "add_text_layer": str(add_text_layer).lower(),
    }
    requested_entities = entities.strip()
    if requested_entities:
        data["entities"] = requested_entities

    try:
        with path.open("rb") as handle:
            response = requests.post(
                f"{API_BASE}/v1/pdf/detect-sensitive?format=json",
                files={"file": (path.name, handle, "application/pdf")},
                data=data,
                headers=_api_headers(),
                timeout=300,
            )
        if not response.ok:
            return None, _api_error(response)

        payload = response.json()
        download_url = payload.get("download_url")
        if not download_url:
            # Report the gap instead of failing with a KeyError.
            return None, "Processed PDF download URL is missing from the API response."

        download = requests.get(
            f"{API_BASE}{download_url}", headers=_api_headers(), timeout=120
        )
    # RequestException derives from OSError, so it has to be matched first.
    except requests.RequestException as exc:
        return None, f"API request to {API_BASE} failed: {exc}"
    except OSError as exc:
        return None, f"Could not read the uploaded file: {exc}"

    if not download.ok:
        return None, "Processed PDF could not be downloaded."

    # delete=False: the file must outlive this call so the caller can serve it.
    with tempfile.NamedTemporaryFile(delete=False, suffix="_sensitive.pdf") as output:
        output.write(download.content)

    summary = {
        "matches": payload.get("matches"),
        "finding_count": payload.get("finding_count"),
        "ocr_pages": payload.get("ocr_pages"),
        # ``or []`` also covers an explicit null; cap the list to keep the report short.
        "findings": (payload.get("findings") or [])[:20],
    }
    return output.name, json.dumps(summary, indent=2)
131
+
132
+
133
def match_resume_ui(resume: str, job_description: str, top_keywords: int) -> str:
    """Post a resume and job description to ``/v1/match/resume`` and return the reply as text.

    Args:
        resume: Resume text; must not be blank.
        job_description: Job description text; must not be blank.
        top_keywords: Keyword count, converted with ``int`` before sending.

    Returns:
        The JSON reply pretty-printed with two-space indentation, or a
        message describing missing input, a failed request, or an API error.
    """
    if not resume.strip() or not job_description.strip():
        return "Provide both resume and job description text."

    try:
        response = requests.post(
            f"{API_BASE}/v1/match/resume",
            json={
                "resume": resume,
                "job_description": job_description,
                "top_keywords": int(top_keywords),
            },
            headers=_api_headers(),
            timeout=60,
        )
        if not response.ok:
            return _api_error(response)
        return json.dumps(response.json(), indent=2)
    except requests.RequestException as exc:
        # Same outcome style as check_api_health: a message, not an exception.
        return f"API request to {API_BASE} failed: {exc}"
150
+
151
+
152
def structure_pdf_ui(pdf_file: Any, mode: str, force_ocr: bool) -> tuple[Any, str]:
    """Submit a PDF to ``/v1/pdf/structure`` and fetch the structured result.

    Args:
        pdf_file: Upload value, resolved with ``resolve_upload_path``.
        mode: Output mode forwarded to the API.
        force_ocr: Sent as ``"true"``/``"false"``.

    Returns:
        ``(path, report)``: the path of a temporary file holding the
        structured PDF plus a JSON report, or ``(None, message)`` when
        input is missing, a request fails, or the job fails or times out.
    """
    path = resolve_upload_path(pdf_file)
    if path is None:
        return None, "Upload a PDF file."

    try:
        return _run_structure_request(path, mode, force_ocr)
    # RequestException derives from OSError, so it has to be matched first.
    except requests.RequestException as exc:
        return None, f"API request to {API_BASE} failed: {exc}"
    except OSError as exc:
        return None, f"Could not read or write a local file: {exc}"


def _run_structure_request(path: Path, mode: str, force_ocr: bool) -> tuple[Any, str]:
    """Upload the PDF, wait for the job if it runs asynchronously, and download the output."""
    with path.open("rb") as handle:
        response = requests.post(
            f"{API_BASE}/v1/pdf/structure?async=true",
            files={"file": (path.name, handle, "application/pdf")},
            data={"mode": mode, "force_ocr": str(force_ocr).lower()},
            headers=_api_headers(),
            timeout=120,
        )

    if response.status_code == 202:
        # Accepted as a background job: poll until it finishes.
        poll_url = response.json().get("poll_url")
        if not poll_url:
            return None, "Async job started but poll_url is missing."
        payload, error = _poll_structure_job(poll_url)
        if payload is None:
            return None, error
    elif response.ok:
        payload = response.json()
    else:
        return None, _api_error(response)

    download_url = payload.get("download_url")
    if not download_url:
        return None, "Structured PDF is not ready yet."

    download = requests.get(f"{API_BASE}{download_url}", headers=_api_headers(), timeout=120)
    if not download.ok:
        return None, "Structured PDF could not be downloaded."

    # delete=False: the file must outlive this call so the caller can serve it.
    with tempfile.NamedTemporaryFile(delete=False, suffix="_structured.pdf") as output:
        output.write(download.content)

    # Details may sit under "result" or directly on the payload.
    result = payload.get("result") or payload
    summary = {
        "job_status": payload.get("job_status"),
        "mode": result.get("mode"),
        "document_title": result.get("document_title"),
        "pages_processed": result.get("pages_processed"),
        "ocr_pages": result.get("ocr_pages"),
    }
    return output.name, json.dumps(summary, indent=2)


def _poll_structure_job(poll_url: str) -> tuple[Any, str]:
    """Poll a structure job up to 300 times, 2 seconds apart; return ``(payload, error)``.

    Exactly one side is meaningful: ``payload`` is the completed job's JSON
    body and ``error`` is ``""``, or ``payload`` is ``None`` and ``error``
    explains the failure.
    """
    for _ in range(300):
        poll = requests.get(f"{API_BASE}{poll_url}", headers=_api_headers(), timeout=30)
        if not poll.ok:
            return None, _api_error(poll)
        job_payload = poll.json()
        job_status = job_payload.get("job_status")
        if job_status == "completed":
            return job_payload, ""
        if job_status == "failed":
            # ``or`` also covers an explicit null error field.
            return None, job_payload.get("error") or "Structure job failed."
        time.sleep(2)
    return None, "Structure job timed out while polling."
211
+
212
+
213
def summarize_text_ui(text: str, sentences: int) -> str:
    """Post text to ``/v1/text/summarize`` and return the reply as text.

    Args:
        text: Text to summarize; must not be blank.
        sentences: Requested sentence count, converted with ``int`` before sending.

    Returns:
        The JSON reply pretty-printed with two-space indentation, or a
        message describing missing input, a failed request, or an API error.
    """
    if not text.strip():
        return "Provide text to summarize."

    try:
        response = requests.post(
            f"{API_BASE}/v1/text/summarize",
            json={"text": text, "sentences": int(sentences)},
            headers=_api_headers(),
            timeout=60,
        )
        if not response.ok:
            return _api_error(response)
        return json.dumps(response.json(), indent=2)
    except requests.RequestException as exc:
        # Same outcome style as check_api_health: a message, not an exception.
        return f"API request to {API_BASE} failed: {exc}"
226
+
227
+
228
def build_ui():
    """Assemble the Gradio ``Blocks`` app with one tab per feature and return it.

    Gradio is imported here rather than at module level. The API health
    line is computed once, while the layout is being built.
    """
    import gradio as gr

    # NOTE(review): the nesting of widgets under each Row is reconstructed
    # from context; confirm against the packaged source.
    action_choices = [
        "Highlight",
        "Redact",
        "Frame",
        "Underline",
        "Squiggly",
        "Strikeout",
    ]

    with gr.Blocks(title="Document Intelligence Platform") as demo:
        intro = (
            "# Document Intelligence Platform\n"
            "Upload documents, detect sensitive data, match resumes, and summarize text. "
            f"Backend API: `{API_BASE}`"
        )
        gr.Markdown(intro)
        gr.Markdown(check_api_health())

        # --- Tab: pattern-based annotation ---
        with gr.Tab("PDF regex annotate"):
            with gr.Row():
                annotate_file = gr.File(label="PDF upload", file_types=[".pdf"])
                annotate_pattern = gr.Textbox(label="Regex pattern", placeholder="CONFIDENTIAL")
                annotate_action = gr.Dropdown(action_choices, value="Highlight", label="Action")
            annotate_btn = gr.Button("Annotate PDF")
            annotate_output = gr.File(label="Annotated PDF")
            annotate_status = gr.Textbox(label="Status")

            annotate_btn.click(
                annotate_pdf_ui,
                inputs=[annotate_file, annotate_pattern, annotate_action],
                outputs=[annotate_output, annotate_status],
            )

        # --- Tab: sensitive-data detection ---
        with gr.Tab("Sensitive PDF (OCR + Presidio)"):
            gr.Markdown(
                "For scanned PDFs, EasyOCR extracts text and Presidio highlights PII. "
                "Leave entities blank to use the default preset."
            )
            with gr.Row():
                sensitive_file = gr.File(label="PDF upload", file_types=[".pdf"])
                sensitive_action = gr.Dropdown(action_choices, value="Highlight", label="Action")
                sensitive_entities = gr.Textbox(
                    label="Presidio entities (comma-separated, optional)",
                    placeholder="EMAIL_ADDRESS,PHONE_NUMBER,US_SSN,CREDIT_CARD,PERSON",
                )
            with gr.Row():
                sensitive_force_ocr = gr.Checkbox(label="Force OCR on all pages", value=False)
                sensitive_text_layer = gr.Checkbox(label="Add searchable text layer", value=True)
            sensitive_btn = gr.Button("Detect and annotate sensitive data")
            sensitive_output = gr.File(label="Processed PDF")
            sensitive_report = gr.Textbox(label="Findings report", lines=12)

            sensitive_inputs = [
                sensitive_file,
                sensitive_action,
                sensitive_entities,
                sensitive_force_ocr,
                sensitive_text_layer,
            ]
            sensitive_btn.click(
                detect_sensitive_ui,
                inputs=sensitive_inputs,
                outputs=[sensitive_output, sensitive_report],
            )

        # --- Tab: PDF structuring ---
        with gr.Tab("PDF structure (LLM)"):
            gr.Markdown(
                "Convert scanned or unstructured PDFs into a curated digital PDF. "
                "Requires `DOCINTEL_LLM_API_KEY` on the API server (default model: `gpt-4o-mini`). "
                "Get a key: [platform.openai.com/api-keys](https://platform.openai.com/api-keys) "
                "| [setup guide](https://platform.openai.com/docs/quickstart)."
            )
            with gr.Row():
                structure_file = gr.File(label="PDF upload", file_types=[".pdf"])
                structure_mode = gr.Dropdown(
                    ["curate", "searchable"],
                    value="curate",
                    label="Output mode",
                )
                structure_force_ocr = gr.Checkbox(label="Force OCR on all pages", value=False)
            structure_btn = gr.Button("Structure PDF")
            structure_output = gr.File(label="Structured PDF")
            structure_report = gr.Textbox(label="Structure report", lines=8)

            structure_btn.click(
                structure_pdf_ui,
                inputs=[structure_file, structure_mode, structure_force_ocr],
                outputs=[structure_output, structure_report],
            )

        # --- Tab: resume matching ---
        with gr.Tab("Resume matching"):
            resume_text = gr.Textbox(label="Resume", lines=8)
            job_text = gr.Textbox(label="Job description", lines=8)
            top_kw = gr.Slider(5, 50, value=15, step=1, label="Top keywords")
            match_btn = gr.Button("Score match")
            match_output = gr.Textbox(label="Match result", lines=12)
            match_btn.click(
                match_resume_ui,
                inputs=[resume_text, job_text, top_kw],
                outputs=match_output,
            )

        # --- Tab: text summarization ---
        with gr.Tab("Text summarization"):
            source_text = gr.Textbox(label="Source text", lines=10)
            sentence_count = gr.Slider(1, 10, value=3, step=1, label="Sentences")
            summary_btn = gr.Button("Summarize")
            summary_output = gr.Textbox(label="Summary result", lines=10)
            summary_btn.click(
                summarize_text_ui,
                inputs=[source_text, sentence_count],
                outputs=summary_output,
            )

    return demo
339
+
340
+
341
def launch_ui() -> None:
    """Build the UI and serve it on ``GRADIO_HOST``:``GRADIO_PORT`` with sharing disabled."""
    build_ui().launch(server_name=GRADIO_HOST, server_port=GRADIO_PORT, share=False)


# Start the UI when this module is run as a script.
if __name__ == "__main__":
    launch_ui()
docintel/wsgi.py ADDED
@@ -0,0 +1,5 @@
1
+ """WSGI entry point for production servers."""
2
+
3
+ from docintel.app import create_app
4
+
5
+ app = create_app()