content-core 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. content_core/__init__.py +216 -0
  2. content_core/cc_config.yaml +86 -0
  3. content_core/common/__init__.py +38 -0
  4. content_core/common/exceptions.py +70 -0
  5. content_core/common/retry.py +325 -0
  6. content_core/common/state.py +64 -0
  7. content_core/common/types.py +15 -0
  8. content_core/common/utils.py +31 -0
  9. content_core/config.py +575 -0
  10. content_core/content/__init__.py +6 -0
  11. content_core/content/cleanup/__init__.py +5 -0
  12. content_core/content/cleanup/core.py +15 -0
  13. content_core/content/extraction/__init__.py +13 -0
  14. content_core/content/extraction/graph.py +252 -0
  15. content_core/content/identification/__init__.py +9 -0
  16. content_core/content/identification/file_detector.py +505 -0
  17. content_core/content/summary/__init__.py +5 -0
  18. content_core/content/summary/core.py +15 -0
  19. content_core/logging.py +15 -0
  20. content_core/mcp/__init__.py +5 -0
  21. content_core/mcp/server.py +214 -0
  22. content_core/models.py +60 -0
  23. content_core/models_config.yaml +31 -0
  24. content_core/notebooks/run.ipynb +359 -0
  25. content_core/notebooks/urls.ipynb +154 -0
  26. content_core/processors/audio.py +272 -0
  27. content_core/processors/docling.py +79 -0
  28. content_core/processors/office.py +331 -0
  29. content_core/processors/pdf.py +292 -0
  30. content_core/processors/text.py +36 -0
  31. content_core/processors/url.py +324 -0
  32. content_core/processors/video.py +166 -0
  33. content_core/processors/youtube.py +262 -0
  34. content_core/py.typed +2 -0
  35. content_core/templated_message.py +70 -0
  36. content_core/tools/__init__.py +9 -0
  37. content_core/tools/cleanup.py +15 -0
  38. content_core/tools/extract.py +21 -0
  39. content_core/tools/summarize.py +17 -0
  40. content_core-1.10.0.dist-info/METADATA +742 -0
  41. content_core-1.10.0.dist-info/RECORD +44 -0
  42. content_core-1.10.0.dist-info/WHEEL +4 -0
  43. content_core-1.10.0.dist-info/entry_points.txt +5 -0
  44. content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,331 @@
1
+ import asyncio
2
+ from functools import partial
3
+
4
+ from docx import Document # type: ignore
5
+ from openpyxl import load_workbook # type: ignore
6
+ from pptx import Presentation # type: ignore
7
+
8
+ from content_core.common import ProcessSourceState
9
+ from content_core.logging import logger
10
+
11
# MIME types this module can extract; extract_office_content() asserts
# its input is one of these before dispatching.
SUPPORTED_OFFICE_TYPES = [
    # .docx (Word)
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    # .pptx (PowerPoint)
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    # .xlsx (Excel)
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]
16
+
17
+
18
async def extract_docx_content_detailed(file_path):
    """Extract content from a DOCX file as markdown-ish text.

    Headings become ``#``-prefixed lines, list paragraphs become numbered or
    bulleted lines, and bold/italic runs become ``**``/``*`` markup.  The
    blocking python-docx work runs in the default thread-pool executor.

    Args:
        file_path: Path to the .docx file.
    Returns:
        str: Extracted text, or None if extraction failed (error is logged).
    """

    def _extract():
        try:
            doc = Document(file_path)
            content = []

            for paragraph in doc.paragraphs:
                # Skip whitespace-only paragraphs entirely.
                if not paragraph.text.strip():
                    continue

                style = paragraph.style.name if paragraph.style else "Normal"
                text = paragraph.text.strip()

                # Get paragraph formatting
                p_format = paragraph.paragraph_format
                indent = p_format.left_indent or 0

                # Convert indent to spaces (1 level = 4 spaces).
                # `indent` is a Length object when set, 0 otherwise, so the
                # hasattr check distinguishes the two.
                indent_level = 0
                if hasattr(indent, "pt"):
                    indent_level = int(indent.pt / 72)  # 72 points = 1 inch
                indent_spaces = " " * (indent_level * 4)

                # Handle different types of formatting
                if "Heading" in style:
                    # "Heading 1" .. "Heading 9" -> matching number of '#'.
                    level = style[-1] if style[-1].isdigit() else "1"
                    heading_marks = "#" * int(level)
                    content.append(f"\n{heading_marks} {text}\n")

                # Handle bullet points
                elif (
                    paragraph.style
                    and hasattr(paragraph.style, "name")
                    and paragraph.style.name.startswith("List")
                ):
                    # Numbered list: detected via the raw XML numbering
                    # properties (w:pPr/w:numPr) on the paragraph element.
                    if (
                        hasattr(paragraph._p, "pPr")
                        and paragraph._p.pPr is not None
                        and hasattr(paragraph._p.pPr, "numPr")
                        and paragraph._p.pPr.numPr is not None
                    ):
                        # Try to get the actual number
                        try:
                            if (
                                hasattr(paragraph._p.pPr.numPr, "numId")
                                and paragraph._p.pPr.numPr.numId is not None
                                and hasattr(paragraph._p.pPr.numPr.numId, "val")
                            ):
                                # NOTE(review): numId.val identifies the
                                # numbering *definition*, not the rendered item
                                # number — items may all share one id. Verify
                                # whether sequential numbering was intended.
                                number = paragraph._p.pPr.numPr.numId.val
                                content.append(f"{indent_spaces}{number}. {text}")
                            else:
                                content.append(f"{indent_spaces}1. {text}")
                        except Exception:
                            # Fall back to "1." if the XML shape is unexpected.
                            content.append(f"{indent_spaces}1. {text}")
                    # Bullet list
                    else:
                        content.append(f"{indent_spaces}* {text}")

                else:
                    # Plain paragraph: re-assemble run by run so bold/italic
                    # spans can be wrapped in markdown markers.
                    formatted_text = []
                    for run in paragraph.runs:
                        if run.bold:
                            formatted_text.append(f"**{run.text}**")
                        elif run.italic:
                            formatted_text.append(f"*{run.text}*")
                        else:
                            formatted_text.append(run.text)

                    content.append(f"{indent_spaces}{''.join(formatted_text)}")

            return "\n\n".join(content)

        except Exception as e:
            logger.error(f"Failed to extract DOCX content: {e}")
            return None

    return await asyncio.get_event_loop().run_in_executor(None, _extract)
99
+
100
+
101
async def get_docx_info(file_path):
    """Get DOCX metadata, content and statistics.

    The original implementation wrapped the blocking python-docx calls in an
    ``async`` helper that was awaited directly, so ``Document()`` ran on the
    event loop.  The blocking part now runs in the default executor, matching
    the pattern used by get_pptx_info().

    Args:
        file_path: Path to the .docx file.
    Returns:
        dict with ``metadata``, ``content`` and ``statistics`` keys, or None
        on failure (error is logged).
    """

    def _get_metadata_and_stats():
        # Blocking python-docx work; must stay off the event loop.
        doc = Document(file_path)

        # Core (Dublin Core style) document properties.
        core_props = {
            "author": doc.core_properties.author,
            "created": doc.core_properties.created,
            "modified": doc.core_properties.modified,
            "title": doc.core_properties.title,
            "subject": doc.core_properties.subject,
            "keywords": doc.core_properties.keywords,
            "category": doc.core_properties.category,
            "comments": doc.core_properties.comments,
        }

        # Simple counts over non-empty paragraphs.
        stats = {
            "paragraph_count": len(doc.paragraphs),
            "word_count": sum(
                len(p.text.split()) for p in doc.paragraphs if p.text.strip()
            ),
            "character_count": sum(
                len(p.text) for p in doc.paragraphs if p.text.strip()
            ),
        }
        return core_props, stats

    try:
        core_props, stats = await asyncio.get_event_loop().run_in_executor(
            None, _get_metadata_and_stats
        )
        # Content extraction is already async and handles its own executor.
        content = await extract_docx_content_detailed(file_path)
        return {"metadata": core_props, "content": content, "statistics": stats}
    except Exception as e:
        logger.error(f"Failed to get DOCX info: {e}")
        return None
141
+
142
+
143
async def extract_pptx_content(file_path):
    """Extract content from PPTX file.

    Each slide becomes a ``# Slide N`` section; the slide title (if any)
    becomes a ``##`` heading and every other text-bearing shape contributes
    its stripped text.  Runs python-pptx work in the default executor.

    Returns the combined text, or None on failure (error is logged).
    """

    def _extract():
        try:
            deck = Presentation(file_path)
            parts = []

            for idx, slide in enumerate(deck.slides, 1):
                parts.append(f"\n# Slide {idx}\n")

                title_shape = slide.shapes.title
                if title_shape:
                    parts.append(f"## {title_shape.text}\n")

                for shape in slide.shapes:
                    if not hasattr(shape, "text"):
                        continue
                    if not shape.text.strip():
                        continue
                    # The title's text was already emitted above.
                    if shape != title_shape:
                        parts.append(shape.text.strip())

            return "\n\n".join(parts)

        except Exception as e:
            logger.error(f"Failed to extract PPTX content: {e}")
            return None

    return await asyncio.get_event_loop().run_in_executor(None, _extract)
173
+
174
+
175
async def extract_xlsx_content(file_path, max_rows=10000, max_cols=100):
    """Extract content from an XLSX file as one markdown table per sheet.

    Row 1 of each sheet is used as the table header.  Blocking openpyxl work
    runs in the default executor.

    Args:
        file_path: Path to the .xlsx workbook.
        max_rows: Cap on rows scanned per sheet (keeps huge sheets bounded).
        max_cols: Cap on columns scanned per sheet.
    Returns:
        str: Markdown rendering of every sheet, or None on failure
        (error is logged).
    """

    def _extract():
        try:
            # data_only=True returns cached formula results, not formulas.
            wb = load_workbook(file_path, data_only=True)
            content = []

            for sheet in wb.sheetnames:
                ws = wb[sheet]
                content.append(f"\n# Sheet: {sheet}\n")

                # Clamp the scanned area to the configured limits.
                max_row = min(ws.max_row, max_rows)
                max_col = min(ws.max_column, max_cols)

                # Markdown table header built from row 1.
                headers = []
                for col in range(1, max_col + 1):
                    cell_value = ws.cell(row=1, column=col).value
                    headers.append(str(cell_value) if cell_value is not None else "")

                content.append("| " + " | ".join(headers) + " |")
                content.append("| " + " | ".join(["---"] * len(headers)) + " |")

                # Data rows (row 2 onward).
                for row in range(2, max_row + 1):
                    row_data = []
                    for col in range(1, max_col + 1):
                        cell_value = ws.cell(row=row, column=col).value
                        row_data.append(
                            str(cell_value) if cell_value is not None else ""
                        )
                    content.append("| " + " | ".join(row_data) + " |")

            return "\n".join(content)

        except Exception as e:
            logger.error(f"Failed to extract XLSX content: {e}")
            return None

    # The original passed `partial(_extract)` — a no-op wrapper; call directly.
    return await asyncio.get_event_loop().run_in_executor(None, _extract)
217
+
218
+
219
async def get_pptx_info(file_path):
    """Get PPTX metadata and content.

    Metadata/statistics come from a synchronous python-pptx pass run in the
    default executor; the slide text comes from extract_pptx_content().
    Returns a dict with ``metadata``, ``statistics`` and ``content`` keys, or
    None on failure (error is logged).
    """

    def _collect_metadata(path):
        """Synchronous helper to extract metadata using python-pptx."""
        try:
            deck = Presentation(path)
            slide_total = len(deck.slides)

            props = {
                "slide_count": slide_total,
                "title": "",  # PowerPoint doesn't have built-in metadata like Word
            }

            shape_total = 0
            text_frame_total = 0
            for slide in deck.slides:
                shape_total += len(slide.shapes)
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        text_frame_total += 1

            stats = {
                "slide_count": slide_total,
                "shape_count": shape_total,
                "text_frame_count": text_frame_total,
            }
            return {"metadata": props, "statistics": stats}
        except Exception as e:
            logger.error(f"Failed to get PPTX metadata: {e}")
            return None

    try:
        # Blocking python-pptx work stays off the event loop.
        metadata_info = await asyncio.get_event_loop().run_in_executor(
            None, _collect_metadata, file_path
        )

        # Content extraction is already async; await it directly.
        content = await extract_pptx_content(file_path)

        if metadata_info:
            return {**metadata_info, "content": content}
        # Metadata pass failed — still return whatever content we got.
        return {"metadata": {}, "statistics": {}, "content": content}

    except Exception as e:
        logger.error(f"Failed to get PPTX info: {e}")
        return None
262
+
263
+
264
async def get_xlsx_info(file_path):
    """Get XLSX metadata, content and statistics.

    As with get_docx_info(), the original awaited an ``async`` wrapper that
    ran blocking ``load_workbook()`` directly on the event loop; the blocking
    part now runs in the default executor.

    Args:
        file_path: Path to the .xlsx workbook.
    Returns:
        dict with ``metadata``, ``content`` and ``statistics`` keys, or None
        on failure (error is logged).
    """

    def _get_metadata_and_stats():
        # Blocking openpyxl work; must stay off the event loop.
        wb = load_workbook(file_path, data_only=True)

        props = {
            "sheet_count": len(wb.sheetnames),
            "sheets": wb.sheetnames,
            "title": wb.properties.title,
            "creator": wb.properties.creator,
            "created": wb.properties.created,
            "modified": wb.properties.modified,
        }

        stats = {
            "sheet_count": len(wb.sheetnames),
            "total_rows": sum(sheet.max_row for sheet in wb.worksheets),
            "total_columns": sum(sheet.max_column for sheet in wb.worksheets),
        }
        return props, stats

    try:
        props, stats = await asyncio.get_event_loop().run_in_executor(
            None, _get_metadata_and_stats
        )
        # Content extraction is already async and handles its own executor.
        content = await extract_xlsx_content(file_path)
        return {"metadata": props, "content": content, "statistics": stats}
    except Exception as e:
        logger.error(f"Failed to get XLSX info: {e}")
        return None
298
+
299
+
300
async def extract_office_content(state: ProcessSourceState):
    """Universal function to extract content from Office files.

    Dispatches on ``state.identified_type`` (MIME type) to the matching
    DOCX/PPTX/XLSX extractor pair.

    Args:
        state: Must carry ``file_path`` and a supported ``identified_type``
            (one of SUPPORTED_OFFICE_TYPES).
    Returns:
        dict with ``content`` (extracted text or None) and ``metadata``
        (the extractor's info dict, minus its duplicated ``content`` key;
        empty dict when metadata extraction failed).
    Raises:
        Exception: if ``doc_type`` matches none of the handled formats.
    """
    assert state.file_path, "No file path provided"
    assert state.identified_type in SUPPORTED_OFFICE_TYPES, "Unsupported File Type"
    file_path = state.file_path
    doc_type = state.identified_type

    if (
        doc_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        logger.debug("Extracting content from DOCX file")
        content = await extract_docx_content_detailed(file_path)
        info = await get_docx_info(file_path)
    elif (
        doc_type
        == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ):
        logger.debug("Extracting content from PPTX file")
        content = await extract_pptx_content(file_path)
        info = await get_pptx_info(file_path)
    elif (
        doc_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ):
        logger.debug("Extracting content from XLSX file")
        content = await extract_xlsx_content(file_path)
        info = await get_xlsx_info(file_path)
    else:
        raise Exception(f"Unsupported file format: {doc_type}")

    # The get_*_info() helpers return None on failure; the original
    # unconditional `del info["content"]` raised TypeError in that case.
    if info is None:
        info = {}
    else:
        info.pop("content", None)
    return {"content": content, "metadata": info}
@@ -0,0 +1,292 @@
1
+ import asyncio
2
+ import re
3
+ import unicodedata
4
+
5
+ import fitz # type: ignore
6
+
7
+ from content_core.common import ProcessSourceState
8
+ from content_core.config import CONFIG
9
+ from content_core.logging import logger
10
+
11
def count_formula_placeholders(text):
    """Count the number of formula placeholders in extracted text.

    Args:
        text (str): Extracted text content (may be None or empty).
    Returns:
        int: Number of ``<!-- formula-not-decoded -->`` markers found;
        0 for None/empty input.
    """
    return text.count("<!-- formula-not-decoded -->") if text else 0
23
+
24
+
25
def extract_page_with_ocr(page, page_num):
    """Extract text from a page using OCR (Tesseract).

    Args:
        page: PyMuPDF page object.
        page_num (int): 1-based page number, used only for logging.
    Returns:
        str: OCR-extracted text, or None if OCR fails or produces no
        TextPage (all failures are logged, never raised).
    """
    try:
        logger.debug(f"Attempting OCR extraction for page {page_num}")
        textpage = page.get_textpage_ocr()
        if not textpage:
            logger.warning(f"OCR TextPage creation failed for page {page_num}")
            return None
        ocr_text = textpage.extractText()
        logger.debug(f"OCR successful for page {page_num}, extracted {len(ocr_text)} characters")
        return ocr_text
    except (ImportError, RuntimeError, OSError) as e:
        # Common errors: Tesseract not installed, OCR failure, file access issues
        logger.debug(f"OCR extraction failed for page {page_num}: {e}")
        return None
    except Exception as e:
        # Unexpected errors - log as warning for debugging
        logger.warning(f"Unexpected error during OCR extraction for page {page_num}: {e}")
        return None
55
+
56
+
57
def convert_table_to_markdown(table):
    """Convert a PyMuPDF table to markdown format.

    Args:
        table: Table data from PyMuPDF (list of rows, each a list of cells;
            the first row is treated as the header).
    Returns:
        str: Markdown-formatted table ending in a newline, or "" when the
        table or its header row is empty.
    """
    if not table or not table[0]:
        return ""

    def _fmt(cell):
        # `is not None` (not plain truthiness) so real-but-falsy values
        # like 0, 0.0 and False are rendered instead of dropped — matches
        # the XLSX extractor's cell handling.
        return str(cell) if cell is not None else ""

    header = table[0]
    markdown_lines = [
        "| " + " | ".join(_fmt(cell) for cell in header) + " |",
        "|" + "|".join([" --- " for _ in header]) + "|",
    ]

    # Data rows; skip entirely empty rows.
    for row in table[1:]:
        if row:
            markdown_lines.append("| " + " | ".join(_fmt(cell) for cell in row) + " |")

    return "\n".join(markdown_lines) + "\n"
88
+
89
# Configuration constants
# Fallback values used when CONFIG['extraction']['pymupdf'] does not set
# 'formula_threshold' / 'ocr_fallback' (see _extract_text_from_pdf).
DEFAULT_FORMULA_THRESHOLD = 3
DEFAULT_OCR_FALLBACK = True

# MIME types this module opens with PyMuPDF (fitz); extract_pdf() asserts
# its input is one of these.
SUPPORTED_FITZ_TYPES = [
    "application/pdf",
    "application/epub+zip",
]
97
+
98
+
99
def clean_pdf_text(text):
    """
    Clean text extracted from PDFs with enhanced space handling.
    Preserves special characters like (, ), %, = that are valid in code/math.

    Args:
        text (str): The raw text extracted from a PDF
    Returns:
        str: Cleaned text with minimal necessary spacing
    """
    if not text:
        return text

    # Step 1: Normalize Unicode characters (NFKC already decomposes most
    # ligatures and compatibility characters).
    text = unicodedata.normalize("NFKC", text)

    # Step 2: Replace common PDF artifacts.  Keys are written as \uXXXX
    # escapes: the previous literal smart-quote/ligature keys were prone to
    # source-encoding corruption, which silently broke the mapping.
    replacements = {
        # Common ligatures (safety net for inputs NFKC did not decompose)
        "\ufb01": "fi",   # ﬁ
        "\ufb02": "fl",   # ﬂ
        "\ufb00": "ff",   # ﬀ
        "\ufb03": "ffi",  # ﬃ
        "\ufb04": "ffl",  # ﬄ
        # Quotation marks and apostrophes
        "\u2018": "'",    # left single quote
        "\u2019": "'",    # right single quote
        "\u201c": '"',    # left double quote
        "\u201d": '"',    # right double quote
        "\u2032": "'",    # prime
        "\u201a": ",",    # single low-9 quote
        "\u201e": '"',    # double low-9 quote
        # Dashes and hyphens
        "\u2012": "-",    # figure dash
        "\u2013": "-",    # en dash
        "\u2014": "-",    # em dash
        "\u2015": "-",    # horizontal bar
        # Other common replacements
        "\u2026": "...",       # ellipsis
        "\u2022": "*",         # bullet
        "\u00b0": " degrees ",  # degree sign
        "\u00b9": "1",         # superscript 1
        "\u00b2": "2",         # superscript 2
        "\u00b3": "3",         # superscript 3
        "\u00a9": "(c)",       # copyright
        "\u00ae": "(R)",       # registered
        "\u2122": "(TM)",      # trademark
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    # Step 3: Clean control characters while preserving essential whitespace
    # and special chars
    text = "".join(
        char
        for char in text
        if unicodedata.category(char)[0] != "C"
        or char in "\n\t "
        or char in "()%=[]{}#$@!?.,;:+-*/^<>&|~"
    )

    # Step 4: Enhanced space cleaning
    text = re.sub(r"[ \t]+", " ", text)  # Consolidate horizontal whitespace
    text = re.sub(r" +\n", "\n", text)  # Remove spaces before newlines
    text = re.sub(r"\n +", "\n", text)  # Remove spaces after newlines
    text = re.sub(r"\n\t+", "\n", text)  # Remove tabs at start of lines
    text = re.sub(r"\t+\n", "\n", text)  # Remove tabs at end of lines
    text = re.sub(r"\t+", " ", text)  # Replace tabs with single space

    # Step 5: Remove empty lines while preserving paragraph structure
    text = re.sub(r"\n{3,}", "\n\n", text)  # Max two consecutive newlines
    text = re.sub(r"^\s+", "", text)  # Remove leading whitespace
    text = re.sub(r"\s+$", "", text)  # Remove trailing whitespace

    # Step 6: Clean up around punctuation
    text = re.sub(r"\s+([.,;:!?)])", r"\1", text)  # Remove spaces before punctuation
    text = re.sub(r"(\()\s+", r"\1", text)  # Remove spaces after opening parenthesis
    text = re.sub(
        r"\s+([.,])\s+", r"\1 ", text
    )  # Ensure single space after periods and commas

    # Step 7: Remove zero-width and invisible characters
    text = re.sub(r"[\u200b\u200c\u200d\ufeff\u200e\u200f]", "", text)

    # Step 8: Fix hyphenation and line breaks
    text = re.sub(
        r"(?<=\w)-\s*\n\s*(?=\w)", "", text
    )  # Remove hyphenation at line breaks

    return text.strip()
187
+
188
+
189
+
190
+
191
async def _extract_text_from_pdf(pdf_path):
    """Extract text from PDF asynchronously with table detection.

    Per page: extracts text with quality flags, optionally re-extracts via
    OCR when many formula placeholders are found (driven by
    CONFIG['extraction']['pymupdf']), appends any detected tables as
    markdown, then cleans the combined result with clean_pdf_text().

    Args:
        pdf_path: Path to a document fitz.open() can read.
    Returns:
        str: Cleaned text for the whole document.
    """

    def _extract():
        doc = fitz.open(pdf_path)
        try:
            full_text = []
            logger.debug(f"Found {len(doc)} pages in PDF")

            # Use quality improvement flags for better text extraction
            extraction_flags = (
                fitz.TEXT_PRESERVE_LIGATURES |  # Better character rendering
                fitz.TEXT_PRESERVE_WHITESPACE |  # Better spacing preservation
                fitz.TEXT_PRESERVE_IMAGES  # Better image-text integration
            )

            # Get OCR configuration (falls back to module defaults when the
            # pymupdf section is absent).
            ocr_config = CONFIG.get('extraction', {}).get('pymupdf', {})
            enable_ocr = ocr_config.get('enable_formula_ocr', False)
            formula_threshold = ocr_config.get('formula_threshold', DEFAULT_FORMULA_THRESHOLD)
            ocr_fallback = ocr_config.get('ocr_fallback', DEFAULT_OCR_FALLBACK)

            for page_num, page in enumerate(doc):
                # Extract regular text with quality flags
                standard_text = page.get_text(flags=extraction_flags)

                # Check if we should try OCR for this page.
                # NOTE: `formula_count > 0` is redundant when
                # formula_threshold >= 1, but guards a threshold of 0.
                formula_count = count_formula_placeholders(standard_text)
                use_ocr = (enable_ocr and
                           formula_count >= formula_threshold and
                           formula_count > 0)

                if use_ocr:
                    logger.debug(f"Page {page_num + 1} has {formula_count} formulas, attempting OCR")
                    ocr_text = extract_page_with_ocr(page, page_num + 1)

                    if ocr_text and ocr_fallback:
                        # Use OCR text but preserve table extraction from standard text
                        page_text = ocr_text
                        logger.debug(f"Using OCR text for page {page_num + 1}")
                    else:
                        # OCR failed (or ocr_fallback disabled): keep standard text
                        page_text = standard_text
                        if not ocr_text:
                            logger.debug(f"OCR failed for page {page_num + 1}, using standard extraction")
                else:
                    page_text = standard_text

                # Try to find and extract tables (regardless of OCR)
                try:
                    tables = page.find_tables()
                    if tables:
                        logger.debug(f"Found {len(tables)} table(s) on page {page_num + 1}")

                        # For each table found, convert to markdown and append
                        for table_num, table in enumerate(tables):
                            # Extract table data
                            table_data = table.extract()
                            # Validate table has actual content (not just empty rows/cells)
                            if table_data and len(table_data) > 0 and any(
                                any(str(cell).strip() for cell in row if cell) for row in table_data if row
                            ):
                                # Add a marker before the table
                                page_text += f"\n\n[Table {table_num + 1} from page {page_num + 1}]\n"
                                # Convert to markdown
                                markdown_table = convert_table_to_markdown(table_data)
                                page_text += markdown_table + "\n"
                except Exception as e:
                    # If table extraction fails, continue with regular text
                    logger.debug(f"Table extraction failed on page {page_num + 1}: {e}")

                full_text.append(page_text)

            # Join all pages and clean
            combined_text = "".join(full_text)
            return clean_pdf_text(combined_text)
        finally:
            # Always release the document handle, even on failure.
            doc.close()

    # Run CPU-bound PDF processing in a thread pool
    return await asyncio.get_event_loop().run_in_executor(None, _extract)
272
+
273
+
274
async def extract_pdf(state: ProcessSourceState):
    """
    Parse the PDF (or EPUB) file and extract its content asynchronously.

    Args:
        state: Must carry ``file_path`` and an ``identified_type`` in
            SUPPORTED_FITZ_TYPES.
    Returns:
        dict with a ``content`` key holding the cleaned document text.
    Raises:
        FileNotFoundError: if the file does not exist.
        Exception: wrapping any other extraction failure (original cause is
            chained via ``from``).
    """
    assert state.file_path, "No file path provided"
    assert state.identified_type in SUPPORTED_FITZ_TYPES, "Unsupported File Type"

    # The asserts above guarantee file_path is set and the type is supported,
    # so the original re-check of both conditions was dead code.
    file_path = state.file_path
    try:
        text = await _extract_text_from_pdf(file_path)
    except FileNotFoundError as e:
        # Chain the cause so the original traceback is preserved.
        raise FileNotFoundError(f"File not found at {file_path}") from e
    except Exception as e:
        raise Exception(f"An error occurred: {e}") from e

    return {"content": text}
@@ -0,0 +1,36 @@
1
+ import asyncio
2
+
3
+ from content_core.common import ProcessSourceState
4
+ from content_core.logging import logger
5
+
6
+
7
async def extract_txt(state: ProcessSourceState):
    """
    Parse the text file and extract its content asynchronously.

    Args:
        state: Processing state; the file is read only when ``file_path``
            is set and ``identified_type`` is exactly "text/plain".
    Returns:
        dict: ``{"content": <file text>}`` on success, empty dict when the
        state does not describe a plain-text file.
    Raises:
        FileNotFoundError: if the file does not exist.
        Exception: wrapping any other read failure (cause chained).
    """
    return_dict = {}
    if state.file_path is not None and state.identified_type == "text/plain":
        # The original re-checked `file_path is not None` here — redundant.
        logger.debug(f"Extracting text from {state.file_path}")
        file_path = state.file_path

        def _read_file():
            with open(file_path, "r", encoding="utf-8") as file:
                return file.read()

        # Keep the try body minimal: only the read can raise; the helper
        # definition and logging stay outside so their errors aren't
        # mis-wrapped as read failures.
        try:
            # Run file I/O in thread pool
            content = await asyncio.get_event_loop().run_in_executor(
                None, _read_file
            )
        except FileNotFoundError as e:
            raise FileNotFoundError(f"File not found at {file_path}") from e
        except Exception as e:
            raise Exception(f"An error occurred: {e}") from e

        logger.debug(f"Extracted: {content[:100]}")
        return_dict["content"] = content

    return return_dict