content-core 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_core/__init__.py +216 -0
- content_core/cc_config.yaml +86 -0
- content_core/common/__init__.py +38 -0
- content_core/common/exceptions.py +70 -0
- content_core/common/retry.py +325 -0
- content_core/common/state.py +64 -0
- content_core/common/types.py +15 -0
- content_core/common/utils.py +31 -0
- content_core/config.py +575 -0
- content_core/content/__init__.py +6 -0
- content_core/content/cleanup/__init__.py +5 -0
- content_core/content/cleanup/core.py +15 -0
- content_core/content/extraction/__init__.py +13 -0
- content_core/content/extraction/graph.py +252 -0
- content_core/content/identification/__init__.py +9 -0
- content_core/content/identification/file_detector.py +505 -0
- content_core/content/summary/__init__.py +5 -0
- content_core/content/summary/core.py +15 -0
- content_core/logging.py +15 -0
- content_core/mcp/__init__.py +5 -0
- content_core/mcp/server.py +214 -0
- content_core/models.py +60 -0
- content_core/models_config.yaml +31 -0
- content_core/notebooks/run.ipynb +359 -0
- content_core/notebooks/urls.ipynb +154 -0
- content_core/processors/audio.py +272 -0
- content_core/processors/docling.py +79 -0
- content_core/processors/office.py +331 -0
- content_core/processors/pdf.py +292 -0
- content_core/processors/text.py +36 -0
- content_core/processors/url.py +324 -0
- content_core/processors/video.py +166 -0
- content_core/processors/youtube.py +262 -0
- content_core/py.typed +2 -0
- content_core/templated_message.py +70 -0
- content_core/tools/__init__.py +9 -0
- content_core/tools/cleanup.py +15 -0
- content_core/tools/extract.py +21 -0
- content_core/tools/summarize.py +17 -0
- content_core-1.10.0.dist-info/METADATA +742 -0
- content_core-1.10.0.dist-info/RECORD +44 -0
- content_core-1.10.0.dist-info/WHEEL +4 -0
- content_core-1.10.0.dist-info/entry_points.txt +5 -0
- content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from functools import partial
|
|
3
|
+
|
|
4
|
+
from docx import Document # type: ignore
|
|
5
|
+
from openpyxl import load_workbook # type: ignore
|
|
6
|
+
from pptx import Presentation # type: ignore
|
|
7
|
+
|
|
8
|
+
from content_core.common import ProcessSourceState
|
|
9
|
+
from content_core.logging import logger
|
|
10
|
+
|
|
11
|
+
# Office Open XML MIME types this module can process (DOCX, PPTX, XLSX).
# Checked by extract_office_content() before dispatching to a format handler.
SUPPORTED_OFFICE_TYPES = [
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
async def extract_docx_content_detailed(file_path):
    """Extract content from DOCX file.

    Converts paragraphs to markdown-flavoured text: "Heading N" styles become
    ``#`` headings, "List*" styles become numbered/bulleted items, and
    bold/italic runs become ``**``/``*`` spans. Returns the joined text,
    or None if extraction fails (error is logged).
    """

    def _extract():
        # Blocking python-docx parse; run via the executor below so the
        # event loop is not stalled.
        try:
            doc = Document(file_path)
            content = []

            for paragraph in doc.paragraphs:
                # Skip visually empty paragraphs entirely.
                if not paragraph.text.strip():
                    continue

                style = paragraph.style.name if paragraph.style else "Normal"
                text = paragraph.text.strip()

                # Get paragraph formatting
                p_format = paragraph.paragraph_format
                indent = p_format.left_indent or 0

                # Convert indent to spaces (1 level = 4 spaces)
                indent_level = 0
                if hasattr(indent, "pt"):
                    indent_level = int(indent.pt / 72)  # 72 points = 1 inch
                indent_spaces = " " * (indent_level * 4)

                # Handle different types of formatting
                if "Heading" in style:
                    # "Heading 3" -> "###"; non-digit suffix falls back to level 1
                    level = style[-1] if style[-1].isdigit() else "1"
                    heading_marks = "#" * int(level)
                    content.append(f"\n{heading_marks} {text}\n")

                # Handle bullet points
                elif (
                    paragraph.style
                    and hasattr(paragraph.style, "name")
                    and paragraph.style.name.startswith("List")
                ):
                    # Numbered list — detected via the underlying XML numbering
                    # properties (private python-docx API: paragraph._p).
                    if (
                        hasattr(paragraph._p, "pPr")
                        and paragraph._p.pPr is not None
                        and hasattr(paragraph._p.pPr, "numPr")
                        and paragraph._p.pPr.numPr is not None
                    ):
                        # Try to get the actual number
                        # NOTE(review): numId.val is the numbering-definition id,
                        # not the item's ordinal — all items of one list share it,
                        # so "numbers" here are likely wrong. Confirm whether
                        # sequential per-list counters were intended.
                        try:
                            if (
                                hasattr(paragraph._p.pPr.numPr, "numId")
                                and paragraph._p.pPr.numPr.numId is not None
                                and hasattr(paragraph._p.pPr.numPr.numId, "val")
                            ):
                                number = paragraph._p.pPr.numPr.numId.val
                                content.append(f"{indent_spaces}{number}. {text}")
                            else:
                                content.append(f"{indent_spaces}1. {text}")
                        except Exception:
                            # Any XML-access failure degrades to a "1." item.
                            content.append(f"{indent_spaces}1. {text}")
                    # Bullet list
                    else:
                        content.append(f"{indent_spaces}* {text}")

                else:
                    # Handle text formatting: map bold/italic runs to markdown.
                    formatted_text = []
                    for run in paragraph.runs:
                        if run.bold:
                            formatted_text.append(f"**{run.text}**")
                        elif run.italic:
                            formatted_text.append(f"*{run.text}*")
                        else:
                            formatted_text.append(run.text)

                    content.append(f"{indent_spaces}{''.join(formatted_text)}")

            return "\n\n".join(content)

        except Exception as e:
            logger.error(f"Failed to extract DOCX content: {e}")
            return None

    # Offload the CPU/IO-bound parse to the default thread pool.
    return await asyncio.get_event_loop().run_in_executor(None, _extract)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
async def get_docx_info(file_path):
    """Get DOCX metadata and content.

    Args:
        file_path: Path to the .docx file.

    Returns:
        dict with keys "metadata" (core document properties), "content"
        (markdown-ish text from extract_docx_content_detailed) and
        "statistics" (paragraph/word/character counts), or None on failure.
    """

    def _get_metadata_and_stats():
        # Blocking python-docx work. The original ran this directly inside the
        # coroutine, stalling the event loop; run it in the executor instead
        # (same pattern as _get_pptx_metadata_sync in this module).
        doc = Document(file_path)

        # Extract core properties if available
        core_props = {
            "author": doc.core_properties.author,
            "created": doc.core_properties.created,
            "modified": doc.core_properties.modified,
            "title": doc.core_properties.title,
            "subject": doc.core_properties.subject,
            "keywords": doc.core_properties.keywords,
            "category": doc.core_properties.category,
            "comments": doc.core_properties.comments,
        }

        # Get document statistics (blank paragraphs are ignored).
        stats = {
            "paragraph_count": len(doc.paragraphs),
            "word_count": sum(
                len(p.text.split()) for p in doc.paragraphs if p.text.strip()
            ),
            "character_count": sum(
                len(p.text) for p in doc.paragraphs if p.text.strip()
            ),
        }
        return core_props, stats

    try:
        core_props, stats = await asyncio.get_event_loop().run_in_executor(
            None, _get_metadata_and_stats
        )

        # Content extraction already offloads to the executor internally.
        content = await extract_docx_content_detailed(file_path)

        return {"metadata": core_props, "content": content, "statistics": stats}

    except Exception as e:
        logger.error(f"Failed to get DOCX info: {e}")
        return None
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
async def extract_pptx_content(file_path):
    """Extract content from PPTX file"""

    def _read_slides():
        try:
            deck = Presentation(file_path)
            parts = []

            for slide_number, slide in enumerate(deck.slides, 1):
                parts.append(f"\n# Slide {slide_number}\n")

                # Title first, as a level-2 heading (may be absent).
                title_shape = slide.shapes.title
                if title_shape:
                    parts.append(f"## {title_shape.text}\n")

                # Then every other shape that carries non-empty text.
                for shape in slide.shapes:
                    if not hasattr(shape, "text") or not shape.text.strip():
                        continue
                    if shape == title_shape:
                        continue  # title already emitted above
                    parts.append(shape.text.strip())

            return "\n\n".join(parts)

        except Exception as e:
            logger.error(f"Failed to extract PPTX content: {e}")
            return None

    # Blocking python-pptx parse runs off the event loop.
    return await asyncio.get_event_loop().run_in_executor(None, _read_slides)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
async def extract_xlsx_content(file_path, max_rows=10000, max_cols=100):
    """Extract content from XLSX file.

    Renders each worksheet as a markdown table headed by "# Sheet: <name>".
    Row 1 of each sheet is treated as the table header.

    Args:
        file_path: Path to the .xlsx file.
        max_rows: Cap on rows scanned per sheet (guards huge workbooks).
        max_cols: Cap on columns scanned per sheet.

    Returns:
        str: Markdown text for all sheets, or None on failure.
    """

    def _extract():
        try:
            # data_only=True returns cached formula results, not formulas.
            wb = load_workbook(file_path, data_only=True)
            content = []

            for sheet in wb.sheetnames:
                ws = wb[sheet]
                content.append(f"\n# Sheet: {sheet}\n")

                # Clamp the scanned area so pathological sheets stay bounded.
                max_row = min(ws.max_row, max_rows)
                max_col = min(ws.max_column, max_cols)

                # Create markdown table header from row 1.
                headers = []
                for col in range(1, max_col + 1):
                    cell_value = ws.cell(row=1, column=col).value
                    headers.append(str(cell_value) if cell_value is not None else "")

                content.append("| " + " | ".join(headers) + " |")
                content.append("| " + " | ".join(["---"] * len(headers)) + " |")

                # Add table content (rows 2..max_row).
                for row in range(2, max_row + 1):
                    row_data = []
                    for col in range(1, max_col + 1):
                        cell_value = ws.cell(row=row, column=col).value
                        row_data.append(
                            str(cell_value) if cell_value is not None else ""
                        )
                    content.append("| " + " | ".join(row_data) + " |")

            return "\n".join(content)

        except Exception as e:
            logger.error(f"Failed to extract XLSX content: {e}")
            return None

    # _extract takes no arguments, so the original functools.partial(_extract)
    # wrapper was a no-op; pass the function directly.
    return await asyncio.get_event_loop().run_in_executor(None, _extract)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
async def get_pptx_info(file_path):
    """Get PPTX metadata and content"""

    def _collect_metadata(path):
        """Synchronous helper to extract metadata using python-pptx."""
        try:
            deck = Presentation(path)

            slide_total = len(deck.slides)
            shape_total = 0
            text_frames = 0
            for slide in deck.slides:
                shape_total += len(slide.shapes)
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        text_frames += 1

            return {
                "metadata": {
                    "slide_count": slide_total,
                    "title": "",  # PowerPoint doesn't have built-in metadata like Word
                },
                "statistics": {
                    "slide_count": slide_total,
                    "shape_count": shape_total,
                    "text_frame_count": text_frames,
                },
            }
        except Exception as e:
            logger.error(f"Failed to get PPTX metadata: {e}")
            return None

    try:
        # Blocking python-pptx work happens off the event loop.
        metadata_info = await asyncio.get_event_loop().run_in_executor(
            None, _collect_metadata, file_path
        )

        # Content extraction is already async.
        content = await extract_pptx_content(file_path)

        if metadata_info is None:
            # Metadata failed; still return whatever content we managed to get.
            return {"metadata": {}, "statistics": {}, "content": content}
        return {**metadata_info, "content": content}

    except Exception as e:
        logger.error(f"Failed to get PPTX info: {e}")
        return None
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
async def get_xlsx_info(file_path):
    """Get XLSX metadata and content.

    Args:
        file_path: Path to the .xlsx file.

    Returns:
        dict with keys "metadata" (workbook properties), "content"
        (markdown tables from extract_xlsx_content) and "statistics"
        (sheet/row/column totals), or None on failure.
    """

    def _get_metadata_and_stats():
        # Blocking openpyxl work. The original ran this directly inside the
        # coroutine, stalling the event loop; run it in the executor instead.
        wb = load_workbook(file_path, data_only=True)

        # Extract basic properties
        props = {
            "sheet_count": len(wb.sheetnames),
            "sheets": wb.sheetnames,
            "title": wb.properties.title,
            "creator": wb.properties.creator,
            "created": wb.properties.created,
            "modified": wb.properties.modified,
        }

        # Get workbook statistics
        stats = {
            "sheet_count": len(wb.sheetnames),
            "total_rows": sum(sheet.max_row for sheet in wb.worksheets),
            "total_columns": sum(sheet.max_column for sheet in wb.worksheets),
        }
        return props, stats

    try:
        props, stats = await asyncio.get_event_loop().run_in_executor(
            None, _get_metadata_and_stats
        )

        # Content extraction already offloads to the executor internally.
        content = await extract_xlsx_content(file_path)

        return {"metadata": props, "content": content, "statistics": stats}

    except Exception as e:
        logger.error(f"Failed to get XLSX info: {e}")
        return None
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
async def extract_office_content(state: ProcessSourceState):
    """Universal function to extract content from Office files.

    Dispatches on state.identified_type to the DOCX/PPTX/XLSX handlers.

    Returns:
        dict with "content" (extracted text) and "metadata" (the handler's
        info dict minus its duplicated "content" key; {} if info failed).
    """
    assert state.file_path, "No file path provided"
    assert state.identified_type in SUPPORTED_OFFICE_TYPES, "Unsupported File Type"
    file_path = state.file_path
    doc_type = state.identified_type

    if (
        doc_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        logger.debug("Extracting content from DOCX file")
        content = await extract_docx_content_detailed(file_path)
        info = await get_docx_info(file_path)
    elif (
        doc_type
        == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ):
        logger.debug("Extracting content from PPTX file")
        content = await extract_pptx_content(file_path)
        info = await get_pptx_info(file_path)
    elif (
        doc_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ):
        logger.debug("Extracting content from XLSX file")
        content = await extract_xlsx_content(file_path)
        info = await get_xlsx_info(file_path)
    else:
        raise Exception(f"Unsupported file format: {doc_type}")

    # get_*_info returns None on failure; the original `del info["content"]`
    # raised TypeError in that case. Guard, and drop the duplicated content
    # key defensively with pop().
    if info is None:
        info = {}
    else:
        info.pop("content", None)
    return {"content": content, "metadata": info}
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import re
|
|
3
|
+
import unicodedata
|
|
4
|
+
|
|
5
|
+
import fitz # type: ignore
|
|
6
|
+
|
|
7
|
+
from content_core.common import ProcessSourceState
|
|
8
|
+
from content_core.config import CONFIG
|
|
9
|
+
from content_core.logging import logger
|
|
10
|
+
|
|
11
|
+
def count_formula_placeholders(text):
    """
    Count the number of formula placeholders in extracted text.

    Args:
        text (str): Extracted text content
    Returns:
        int: Number of formula placeholders found
    """
    # Falsy input (None or "") carries no placeholders by definition.
    return text.count('<!-- formula-not-decoded -->') if text else 0
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def extract_page_with_ocr(page, page_num):
    """
    Extract text from a page using OCR (Tesseract).

    Args:
        page: PyMuPDF page object
        page_num (int): Page number for logging
    Returns:
        str: OCR-extracted text or None if OCR fails
    """
    try:
        logger.debug(f"Attempting OCR extraction for page {page_num}")
        # Build an OCR-backed TextPage; falsy result means OCR setup failed.
        textpage = page.get_textpage_ocr()
        if not textpage:
            logger.warning(f"OCR TextPage creation failed for page {page_num}")
            return None
        ocr_text = textpage.extractText()
        logger.debug(f"OCR successful for page {page_num}, extracted {len(ocr_text)} characters")
        return ocr_text
    except (ImportError, RuntimeError, OSError) as e:
        # Expected failure modes: Tesseract not installed, OCR failure,
        # file access issues — debug-level only.
        logger.debug(f"OCR extraction failed for page {page_num}: {e}")
        return None
    except Exception as e:
        # Anything else is unexpected; surface at warning level for debugging.
        logger.warning(f"Unexpected error during OCR extraction for page {page_num}: {e}")
        return None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def convert_table_to_markdown(table):
    """
    Convert a PyMuPDF table to markdown format.

    Args:
        table: Table data from PyMuPDF (list of lists); the first row is
            treated as the header.
    Returns:
        str: Markdown-formatted table, or "" for an empty/headerless table.
    """
    if not table or not table[0]:
        return ""

    def _cell(value):
        # Only None means "empty": the original `str(cell) if cell else ""`
        # silently dropped falsy-but-real values such as 0, 0.0 and False.
        return str(value) if value is not None else ""

    markdown_lines = []

    # Header row
    header = table[0]
    markdown_lines.append("| " + " | ".join(_cell(c) for c in header) + " |")

    # Separator row
    markdown_lines.append("|" + "|".join([" --- " for _ in header]) + "|")

    # Data rows (skip empty rows)
    for row in table[1:]:
        if row:
            markdown_lines.append("| " + " | ".join(_cell(c) for c in row) + " |")

    return "\n".join(markdown_lines) + "\n"
|
|
88
|
+
|
|
89
|
+
# Configuration constants
# Minimum count of "<!-- formula-not-decoded -->" placeholders on a page
# before OCR is attempted (see _extract_text_from_pdf).
DEFAULT_FORMULA_THRESHOLD = 3
# Default for whether successful OCR output replaces the standard extraction.
DEFAULT_OCR_FALLBACK = True

# MIME types that PyMuPDF (fitz) is allowed to open in this module.
SUPPORTED_FITZ_TYPES = [
    "application/pdf",
    "application/epub+zip",
]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def clean_pdf_text(text):
    """
    Clean text extracted from PDFs with enhanced space handling.
    Preserves special characters like (, ), %, = that are valid in code/math.

    Args:
        text (str): The raw text extracted from a PDF
    Returns:
        str: Cleaned text with minimal necessary spacing
    """
    if not text:
        return text

    # Step 1: Normalize Unicode characters (NFKC also folds ligatures,
    # superscripts and similar compatibility characters to ASCII forms).
    text = unicodedata.normalize("NFKC", text)

    # Step 2: Replace common PDF artifacts.
    # Keys are written as Unicode escapes: the literal ligature/smart-quote
    # characters were garbled (mojibake) in the original source, which made
    # several dict entries ambiguous or syntactically broken.
    replacements = {
        # Common ligatures (mostly already handled by NFKC; kept as belt-and-braces)
        "\ufb01": "fi",   # fi ligature
        "\ufb02": "fl",   # fl ligature
        "\ufb00": "ff",   # ff ligature
        "\ufb03": "ffi",  # ffi ligature
        "\ufb04": "ffl",  # ffl ligature
        # Quotation marks and apostrophes
        "\u2018": "'",    # left single quote
        "\u2019": "'",    # right single quote / apostrophe
        "\u201c": '"',    # left double quote
        "\u201d": '"',    # right double quote
        "\u2032": "'",    # prime
        "\u201a": ",",    # single low-9 quote
        "\u201e": '"',    # double low-9 quote
        # Dashes and hyphens
        "\u2012": "-",    # figure dash
        "\u2013": "-",    # en dash
        "\u2014": "-",    # em dash
        "\u2015": "-",    # horizontal bar
        # Other common replacements
        "\u2026": "...",  # ellipsis
        "\u2022": "*",    # bullet
        "\u00b0": " degrees ",
        "\u00b9": "1",
        "\u00b2": "2",
        "\u00b3": "3",
        "\u00a9": "(c)",
        "\u00ae": "(R)",
        "\u2122": "(TM)",
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    # Step 3: Clean control characters while preserving essential whitespace
    # and special chars.
    text = "".join(
        char
        for char in text
        if unicodedata.category(char)[0] != "C"
        or char in "\n\t "
        or char in "()%=[]{}#$@!?.,;:+-*/^<>&|~"
    )

    # Step 4: Enhanced space cleaning
    text = re.sub(r"[ \t]+", " ", text)  # Consolidate horizontal whitespace
    text = re.sub(r" +\n", "\n", text)  # Remove spaces before newlines
    text = re.sub(r"\n +", "\n", text)  # Remove spaces after newlines
    text = re.sub(r"\n\t+", "\n", text)  # Remove tabs at start of lines
    text = re.sub(r"\t+\n", "\n", text)  # Remove tabs at end of lines
    text = re.sub(r"\t+", " ", text)  # Replace tabs with single space

    # Step 5: Remove empty lines while preserving paragraph structure
    text = re.sub(r"\n{3,}", "\n\n", text)  # Max two consecutive newlines
    text = re.sub(r"^\s+", "", text)  # Remove leading whitespace
    text = re.sub(r"\s+$", "", text)  # Remove trailing whitespace

    # Step 6: Clean up around punctuation
    text = re.sub(r"\s+([.,;:!?)])", r"\1", text)  # Remove spaces before punctuation
    text = re.sub(r"(\()\s+", r"\1", text)  # Remove spaces after opening parenthesis
    text = re.sub(
        r"\s+([.,])\s+", r"\1 ", text
    )  # Ensure single space after periods and commas

    # Step 7: Remove zero-width and invisible characters
    text = re.sub(r"[\u200b\u200c\u200d\ufeff\u200e\u200f]", "", text)

    # Step 8: Fix hyphenation and line breaks
    text = re.sub(
        r"(?<=\w)-\s*\n\s*(?=\w)", "", text
    )  # Remove hyphenation at line breaks

    return text.strip()
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
async def _extract_text_from_pdf(pdf_path):
    """Extract text from PDF asynchronously with table detection.

    Per page: pulls text with quality-preserving fitz flags, optionally swaps
    in OCR output when the page shows many un-decoded formula placeholders,
    then appends any detected tables rendered as markdown. The combined text
    is passed through clean_pdf_text() before returning.
    """

    def _extract():
        doc = fitz.open(pdf_path)
        try:
            full_text = []
            logger.debug(f"Found {len(doc)} pages in PDF")

            # Use quality improvement flags for better text extraction
            extraction_flags = (
                fitz.TEXT_PRESERVE_LIGATURES |  # Better character rendering
                fitz.TEXT_PRESERVE_WHITESPACE |  # Better spacing preservation
                fitz.TEXT_PRESERVE_IMAGES  # Better image-text integration
            )

            # Get OCR configuration (missing keys fall back to module defaults)
            ocr_config = CONFIG.get('extraction', {}).get('pymupdf', {})
            enable_ocr = ocr_config.get('enable_formula_ocr', False)
            formula_threshold = ocr_config.get('formula_threshold', DEFAULT_FORMULA_THRESHOLD)
            ocr_fallback = ocr_config.get('ocr_fallback', DEFAULT_OCR_FALLBACK)

            for page_num, page in enumerate(doc):
                # Extract regular text with quality flags
                standard_text = page.get_text(flags=extraction_flags)

                # Check if we should try OCR for this page
                formula_count = count_formula_placeholders(standard_text)
                # NOTE(review): the `formula_count > 0` clause is redundant
                # whenever formula_threshold >= 1 — confirm intended semantics
                # for a configured threshold of 0.
                use_ocr = (enable_ocr and
                           formula_count >= formula_threshold and
                           formula_count > 0)

                if use_ocr:
                    logger.debug(f"Page {page_num + 1} has {formula_count} formulas, attempting OCR")
                    ocr_text = extract_page_with_ocr(page, page_num + 1)

                    # NOTE(review): OCR output is only used when ocr_fallback
                    # is True — with ocr_fallback False, OCR runs but its
                    # result is discarded. Verify this is intentional.
                    if ocr_text and ocr_fallback:
                        # Use OCR text but preserve table extraction from standard text
                        page_text = ocr_text
                        logger.debug(f"Using OCR text for page {page_num + 1}")
                    else:
                        # OCR failed, use standard text
                        page_text = standard_text
                        if not ocr_text:
                            logger.debug(f"OCR failed for page {page_num + 1}, using standard extraction")
                else:
                    page_text = standard_text

                # Try to find and extract tables (regardless of OCR)
                try:
                    tables = page.find_tables()
                    if tables:
                        logger.debug(f"Found {len(tables)} table(s) on page {page_num + 1}")

                        # For each table found, convert to markdown and append
                        for table_num, table in enumerate(tables):
                            # Extract table data
                            table_data = table.extract()
                            # Validate table has actual content (not just empty rows/cells)
                            if table_data and len(table_data) > 0 and any(
                                any(str(cell).strip() for cell in row if cell) for row in table_data if row
                            ):
                                # Add a marker before the table
                                page_text += f"\n\n[Table {table_num + 1} from page {page_num + 1}]\n"
                                # Convert to markdown
                                markdown_table = convert_table_to_markdown(table_data)
                                page_text += markdown_table + "\n"
                except Exception as e:
                    # If table extraction fails, continue with regular text
                    logger.debug(f"Table extraction failed on page {page_num + 1}: {e}")

                full_text.append(page_text)

            # Join all pages and clean
            combined_text = "".join(full_text)
            return clean_pdf_text(combined_text)
        finally:
            # Always release the document handle, even when extraction raised.
            doc.close()

    # Run CPU-bound PDF processing in a thread pool
    return await asyncio.get_event_loop().run_in_executor(None, _extract)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
async def extract_pdf(state: ProcessSourceState):
    """
    Parse the PDF file and extract its content asynchronously.
    """
    result = {}
    assert state.file_path, "No file path provided"
    assert state.identified_type in SUPPORTED_FITZ_TYPES, "Unsupported File Type"

    # Re-check kept from the original control flow (always true after the
    # asserts above).
    if state.file_path is not None and state.identified_type in SUPPORTED_FITZ_TYPES:
        file_path = state.file_path
        try:
            result["content"] = await _extract_text_from_pdf(file_path)
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found at {file_path}")
        except Exception as e:
            raise Exception(f"An error occurred: {e}")

    return result
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
|
|
3
|
+
from content_core.common import ProcessSourceState
|
|
4
|
+
from content_core.logging import logger
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
async def extract_txt(state: ProcessSourceState):
    """
    Parse the text file and extract its content asynchronously.
    """
    result = {}

    # Only plain-text sources with a file path are handled here.
    if state.file_path is None or state.identified_type != "text/plain":
        return result

    logger.debug(f"Extracting text from {state.file_path}")
    file_path = state.file_path

    try:
        def _read_file():
            # Blocking read, executed off the event loop below.
            with open(file_path, "r", encoding="utf-8") as file:
                return file.read()

        content = await asyncio.get_event_loop().run_in_executor(
            None, _read_file
        )

        logger.debug(f"Extracted: {content[:100]}")
        result["content"] = content

    except FileNotFoundError:
        raise FileNotFoundError(f"File not found at {file_path}")
    except Exception as e:
        raise Exception(f"An error occurred: {e}")

    return result
|