skill_seekers-2.7.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skill_seekers/__init__.py +22 -0
- skill_seekers/cli/__init__.py +39 -0
- skill_seekers/cli/adaptors/__init__.py +120 -0
- skill_seekers/cli/adaptors/base.py +221 -0
- skill_seekers/cli/adaptors/claude.py +485 -0
- skill_seekers/cli/adaptors/gemini.py +453 -0
- skill_seekers/cli/adaptors/markdown.py +269 -0
- skill_seekers/cli/adaptors/openai.py +503 -0
- skill_seekers/cli/ai_enhancer.py +310 -0
- skill_seekers/cli/api_reference_builder.py +373 -0
- skill_seekers/cli/architectural_pattern_detector.py +525 -0
- skill_seekers/cli/code_analyzer.py +1462 -0
- skill_seekers/cli/codebase_scraper.py +1225 -0
- skill_seekers/cli/config_command.py +563 -0
- skill_seekers/cli/config_enhancer.py +431 -0
- skill_seekers/cli/config_extractor.py +871 -0
- skill_seekers/cli/config_manager.py +452 -0
- skill_seekers/cli/config_validator.py +394 -0
- skill_seekers/cli/conflict_detector.py +528 -0
- skill_seekers/cli/constants.py +72 -0
- skill_seekers/cli/dependency_analyzer.py +757 -0
- skill_seekers/cli/doc_scraper.py +2332 -0
- skill_seekers/cli/enhance_skill.py +488 -0
- skill_seekers/cli/enhance_skill_local.py +1096 -0
- skill_seekers/cli/enhance_status.py +194 -0
- skill_seekers/cli/estimate_pages.py +433 -0
- skill_seekers/cli/generate_router.py +1209 -0
- skill_seekers/cli/github_fetcher.py +534 -0
- skill_seekers/cli/github_scraper.py +1466 -0
- skill_seekers/cli/guide_enhancer.py +723 -0
- skill_seekers/cli/how_to_guide_builder.py +1267 -0
- skill_seekers/cli/install_agent.py +461 -0
- skill_seekers/cli/install_skill.py +178 -0
- skill_seekers/cli/language_detector.py +614 -0
- skill_seekers/cli/llms_txt_detector.py +60 -0
- skill_seekers/cli/llms_txt_downloader.py +104 -0
- skill_seekers/cli/llms_txt_parser.py +150 -0
- skill_seekers/cli/main.py +558 -0
- skill_seekers/cli/markdown_cleaner.py +132 -0
- skill_seekers/cli/merge_sources.py +806 -0
- skill_seekers/cli/package_multi.py +77 -0
- skill_seekers/cli/package_skill.py +241 -0
- skill_seekers/cli/pattern_recognizer.py +1825 -0
- skill_seekers/cli/pdf_extractor_poc.py +1166 -0
- skill_seekers/cli/pdf_scraper.py +617 -0
- skill_seekers/cli/quality_checker.py +519 -0
- skill_seekers/cli/rate_limit_handler.py +438 -0
- skill_seekers/cli/resume_command.py +160 -0
- skill_seekers/cli/run_tests.py +230 -0
- skill_seekers/cli/setup_wizard.py +93 -0
- skill_seekers/cli/split_config.py +390 -0
- skill_seekers/cli/swift_patterns.py +560 -0
- skill_seekers/cli/test_example_extractor.py +1081 -0
- skill_seekers/cli/test_unified_simple.py +179 -0
- skill_seekers/cli/unified_codebase_analyzer.py +572 -0
- skill_seekers/cli/unified_scraper.py +932 -0
- skill_seekers/cli/unified_skill_builder.py +1605 -0
- skill_seekers/cli/upload_skill.py +162 -0
- skill_seekers/cli/utils.py +432 -0
- skill_seekers/mcp/__init__.py +33 -0
- skill_seekers/mcp/agent_detector.py +316 -0
- skill_seekers/mcp/git_repo.py +273 -0
- skill_seekers/mcp/server.py +231 -0
- skill_seekers/mcp/server_fastmcp.py +1249 -0
- skill_seekers/mcp/server_legacy.py +2302 -0
- skill_seekers/mcp/source_manager.py +285 -0
- skill_seekers/mcp/tools/__init__.py +115 -0
- skill_seekers/mcp/tools/config_tools.py +251 -0
- skill_seekers/mcp/tools/packaging_tools.py +826 -0
- skill_seekers/mcp/tools/scraping_tools.py +842 -0
- skill_seekers/mcp/tools/source_tools.py +828 -0
- skill_seekers/mcp/tools/splitting_tools.py +212 -0
- skill_seekers/py.typed +0 -0
- skill_seekers-2.7.3.dist-info/METADATA +2027 -0
- skill_seekers-2.7.3.dist-info/RECORD +79 -0
- skill_seekers-2.7.3.dist-info/WHEEL +5 -0
- skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
- skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
- skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
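
Assuming the project is published under the name implied by the wheel filename (an assumption; the METADATA file is authoritative), it would typically be installed with "pip install skill-seekers==2.7.3", which provides the skill_seekers package and the CLI entry points listed in entry_points.txt.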
@@ -0,0 +1,1166 @@
#!/usr/bin/env python3
"""
PDF Text Extractor - Complete Feature Set (Tasks B1.2 + B1.3 + B1.4 + B1.5 + Priority 2 & 3)

Extracts text, code blocks, and images from PDF documentation files.
Uses PyMuPDF (fitz) for fast, high-quality extraction.

Features:
- Text and markdown extraction
- Code block detection (font, indent, pattern)
- Language detection with confidence scoring (19+ languages) (B1.4)
- Syntax validation and quality scoring (B1.4)
- Quality statistics and filtering (B1.4)
- Image extraction to files (B1.5)
- Image filtering by size (B1.5)
- Page chunking and chapter detection (B1.3)
- Code block merging across pages (B1.3)

Advanced Features (Priority 2 & 3):
- OCR support for scanned PDFs (requires pytesseract) (Priority 2)
- Password-protected PDF support (Priority 2)
- Table extraction (Priority 2)
- Parallel page processing (Priority 3)
- Caching of expensive operations (Priority 3)

Usage:
    # Basic extraction
    python3 pdf_extractor_poc.py input.pdf
    python3 pdf_extractor_poc.py input.pdf --output output.json
    python3 pdf_extractor_poc.py input.pdf --verbose

    # Quality filtering
    python3 pdf_extractor_poc.py input.pdf --min-quality 5.0

    # Image extraction
    python3 pdf_extractor_poc.py input.pdf --extract-images
    python3 pdf_extractor_poc.py input.pdf --extract-images --image-dir images/

    # Advanced features
    python3 pdf_extractor_poc.py scanned.pdf --ocr
    python3 pdf_extractor_poc.py encrypted.pdf --password mypassword
    python3 pdf_extractor_poc.py input.pdf --extract-tables
    python3 pdf_extractor_poc.py large.pdf --parallel --workers 8

Example:
    python3 pdf_extractor_poc.py docs/manual.pdf -o output.json -v \
        --chunk-size 15 --min-quality 6.0 --extract-images \
        --extract-tables --parallel
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path

# Import unified language detector
from skill_seekers.cli.language_detector import LanguageDetector

# Check if PyMuPDF is installed
try:
    import fitz  # PyMuPDF
except ImportError:
    print("ERROR: PyMuPDF not installed")
    print("Install with: pip install PyMuPDF")
    sys.exit(1)

# Optional dependencies for advanced features
try:
    import pytesseract
    from PIL import Image

    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False

try:
    import concurrent.futures

    CONCURRENT_AVAILABLE = True
except ImportError:
    CONCURRENT_AVAILABLE = False


class PDFExtractor:
    """Extract text and code from PDF documentation"""

    def __init__(
        self,
        pdf_path,
        verbose=False,
        chunk_size=10,
        min_quality=0.0,
        extract_images=False,
        image_dir=None,
        min_image_size=100,
        use_ocr=False,
        password=None,
        extract_tables=False,
        parallel=False,
        max_workers=None,
        use_cache=True,
    ):
        self.pdf_path = pdf_path
        self.verbose = verbose
        self.chunk_size = chunk_size  # Pages per chunk (0 = no chunking)
        self.min_quality = min_quality  # Minimum quality score (0-10)
        self.extract_images = extract_images  # Extract images to files (NEW in B1.5)
        self.image_dir = image_dir  # Directory to save images (NEW in B1.5)
        self.min_image_size = min_image_size  # Minimum image dimension (NEW in B1.5)

        # Advanced features (Priority 2 & 3)
        self.use_ocr = use_ocr  # OCR for scanned PDFs (Priority 2)
        self.password = password  # Password for encrypted PDFs (Priority 2)
        self.extract_tables = extract_tables  # Extract tables (Priority 2)
        self.parallel = parallel  # Parallel processing (Priority 3)
        self.max_workers = max_workers or os.cpu_count()  # Worker threads (Priority 3)
        self.use_cache = use_cache  # Cache expensive operations (Priority 3)

        self.doc = None
        self.pages = []
        self.chapters = []  # Detected chapters/sections
        self.extracted_images = []  # List of extracted image info (NEW in B1.5)
        self._cache = {}  # Cache for expensive operations (Priority 3)

        # Language detection
        self.language_detector = LanguageDetector(min_confidence=0.15)
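
    # For example, a programmatic caller equivalent to the CLI invocation
    # "--min-quality 5.0 --extract-images" (illustrative filename):
    #   PDFExtractor("manual.pdf", min_quality=5.0, extract_images=True)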

    def log(self, message):
        """Print message if verbose mode enabled"""
        if self.verbose:
            print(message)

    def extract_text_with_ocr(self, page):
        """
        Extract text from scanned PDF page using OCR (Priority 2).
        Falls back to regular text extraction if OCR is not available.

        Args:
            page: PyMuPDF page object

        Returns:
            str: Extracted text
        """
        # Try regular text extraction first
        text = page.get_text("text").strip()

        # If page has very little text, it might be scanned
        if len(text) < 50 and self.use_ocr:
            if not TESSERACT_AVAILABLE:
                self.log("⚠️  OCR requested but pytesseract not installed")
                self.log("   Install with: pip install pytesseract Pillow")
                return text

            try:
                # Render page as image
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

                # Run OCR
                ocr_text = pytesseract.image_to_string(img)
                self.log(f"   OCR extracted {len(ocr_text)} chars (was {len(text)})")
                return ocr_text if len(ocr_text) > len(text) else text

            except Exception as e:
                self.log(f"   OCR failed: {e}")
                return text

        return text

    def extract_tables_from_page(self, page):
        """
        Extract tables from PDF page (Priority 2).
        Uses PyMuPDF's table detection.

        Args:
            page: PyMuPDF page object

        Returns:
            list: List of extracted tables as dicts
        """
        if not self.extract_tables:
            return []

        tables = []
        try:
            # PyMuPDF table extraction
            tabs = page.find_tables()
            for idx, tab in enumerate(tabs.tables):
                table_data = {
                    "table_index": idx,
                    "rows": tab.extract(),
                    "bbox": tab.bbox,
                    "row_count": len(tab.extract()),
                    "col_count": len(tab.extract()[0]) if tab.extract() else 0,
                }
                tables.append(table_data)
                self.log(
                    f"   Found table {idx}: {table_data['row_count']}x{table_data['col_count']}"
                )

        except Exception as e:
            self.log(f"   Table extraction failed: {e}")

        return tables
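
    # A 2x2 table would come back as, e.g. (bbox coordinates elided):
    #   {"table_index": 0, "rows": [["Name", "Type"], ["path", "str"]],
    #    "bbox": ..., "row_count": 2, "col_count": 2}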

    def get_cached(self, key):
        """
        Get cached value (Priority 3).

        Args:
            key: Cache key

        Returns:
            Cached value or None
        """
        if not self.use_cache:
            return None
        return self._cache.get(key)

    def set_cached(self, key, value):
        """
        Set cached value (Priority 3).

        Args:
            key: Cache key
            value: Value to cache
        """
        if self.use_cache:
            self._cache[key] = value

    def detect_language_from_code(self, code):
        """
        Detect programming language from code content using patterns.
        Enhanced in B1.4 with confidence scoring.

        UPDATED: Now uses shared LanguageDetector with 20+ languages

        Returns (language, confidence) tuple
        """
        return self.language_detector.detect_from_code(code)

    def validate_code_syntax(self, code, language):
        """
        Validate code syntax (basic checks).
        Enhanced in B1.4 with syntax validation.

        Returns (is_valid, issues) tuple
        """
        issues = []

        # Common syntax checks
        if not code.strip():
            return False, ["Empty code block"]

        # Language-specific validation
        if language == "python":
            # Check indentation consistency
            lines = code.split("\n")
            indent_chars = set()
            for line in lines:
                if line.startswith(" "):
                    indent_chars.add("space")
                elif line.startswith("\t"):
                    indent_chars.add("tab")

            if len(indent_chars) > 1:
                issues.append("Mixed tabs and spaces")

            # Check for unclosed brackets/parens
            open_count = code.count("(") + code.count("[") + code.count("{")
            close_count = code.count(")") + code.count("]") + code.count("}")
            if abs(open_count - close_count) > 2:  # Allow small mismatch
                issues.append("Unbalanced brackets")

        elif language in ["javascript", "java", "cpp", "c", "csharp", "go"]:
            # Check for balanced braces
            open_braces = code.count("{")
            close_braces = code.count("}")
            if abs(open_braces - close_braces) > 1:
                issues.append("Unbalanced braces")

        elif language == "json":
            # Try to parse JSON
            try:
                json.loads(code)
            except (json.JSONDecodeError, ValueError) as e:
                issues.append(f"Invalid JSON syntax: {str(e)[:50]}")

        # General checks
        # Check if code looks like natural language (too many common words)
        common_words = ["the", "and", "for", "with", "this", "that", "have", "from"]
        word_count = sum(1 for word in common_words if word in code.lower())
        if word_count > 5 and len(code.split()) < 50:
            issues.append("May be natural language, not code")

        # Check code/comment ratio
        comment_lines = sum(
            1 for line in code.split("\n") if line.strip().startswith(("#", "//", "/*", "*", "--"))
        )
        total_lines = len([line for line in code.split("\n") if line.strip()])
        if total_lines > 0 and comment_lines / total_lines > 0.7:
            issues.append("Mostly comments")

        return len(issues) == 0, issues
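
    # For example, validate_code_syntax('{"a": 1}', "json") returns (True, []),
    # while validate_code_syntax('{"a": 1', "json") fails the json.loads() check
    # and reports the parse error in its issues list.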

    def score_code_quality(self, code, language, confidence):
        """
        Score the quality/usefulness of detected code block.
        New in B1.4.

        Returns quality score (0-10)
        """
        score = 5.0  # Start with neutral score

        # Factor 1: Language detection confidence
        score += confidence * 2.0

        # Factor 2: Code length (not too short, not too long)
        code_length = len(code.strip())
        if 20 <= code_length <= 500:
            score += 1.0
        elif 500 < code_length <= 2000:
            score += 0.5
        elif code_length < 10:
            score -= 2.0

        # Factor 3: Number of lines
        lines = [line for line in code.split("\n") if line.strip()]
        if 2 <= len(lines) <= 50:
            score += 1.0
        elif len(lines) > 100:
            score -= 1.0

        # Factor 4: Has function/class definitions
        if re.search(r"\b(def|function|class|func|fn|public class)\b", code):
            score += 1.5

        # Factor 5: Has meaningful variable names (not just x, y, i)
        meaningful_vars = re.findall(r"\b[a-z_][a-z0-9_]{3,}\b", code.lower())
        if len(meaningful_vars) >= 2:
            score += 1.0

        # Factor 6: Syntax validation
        is_valid, issues = self.validate_code_syntax(code, language)
        if is_valid:
            score += 1.0
        else:
            score -= len(issues) * 0.5

        # Clamp score to 0-10 range
        return max(0, min(10, score))
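
    # Worked example: a valid 10-line, 200-character Python block containing a
    # "def" and two descriptive names, detected with confidence 0.8, scores
    #   5.0 + (0.8 * 2.0) + 1.0 + 1.0 + 1.5 + 1.0 + 1.0 = 12.1, clamped to 10.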

    def detect_code_blocks_by_font(self, page):
        """
        Detect code blocks by analyzing font properties.
        Monospace fonts typically indicate code.

        Returns list of detected code blocks with metadata.
        """
        code_blocks = []
        blocks = page.get_text("dict")["blocks"]

        monospace_fonts = ["courier", "mono", "consolas", "menlo", "monaco", "dejavu"]

        current_code = []
        current_font = None

        for block in blocks:
            if "lines" not in block:
                continue

            for line in block["lines"]:
                for span in line["spans"]:
                    font = span["font"].lower()
                    text = span["text"]

                    # Check if font is monospace
                    is_monospace = any(mf in font for mf in monospace_fonts)

                    if is_monospace:
                        # Accumulate code text
                        current_code.append(text)
                        current_font = span["font"]
                    else:
                        # End of code block
                        if current_code:
                            code_text = "".join(current_code).strip()
                            if len(code_text) > 10:  # Minimum code length
                                lang, confidence = self.detect_language_from_code(code_text)
                                quality = self.score_code_quality(code_text, lang, confidence)
                                is_valid, issues = self.validate_code_syntax(code_text, lang)

                                code_blocks.append(
                                    {
                                        "code": code_text,
                                        "language": lang,
                                        "confidence": confidence,
                                        "quality_score": quality,
                                        "is_valid": is_valid,
                                        "validation_issues": issues if not is_valid else [],
                                        "font": current_font,
                                        "detection_method": "font",
                                    }
                                )
                            current_code = []
                            current_font = None

        # Handle final code block
        if current_code:
            code_text = "".join(current_code).strip()
            if len(code_text) > 10:
                lang, confidence = self.detect_language_from_code(code_text)
                quality = self.score_code_quality(code_text, lang, confidence)
                is_valid, issues = self.validate_code_syntax(code_text, lang)

                code_blocks.append(
                    {
                        "code": code_text,
                        "language": lang,
                        "confidence": confidence,
                        "quality_score": quality,
                        "is_valid": is_valid,
                        "validation_issues": issues if not is_valid else [],
                        "font": current_font,
                        "detection_method": "font",
                    }
                )

        return code_blocks
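
    # For example, a run of Courier spans forming "for i in range(3): total += i"
    # ends when a proportional-font span follows, and is emitted as one "font"
    # block because the joined text exceeds the 10-character minimum.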

    def detect_code_blocks_by_indent(self, text):
        """
        Detect code blocks by indentation patterns.
        Code often has consistent indentation.

        Returns list of detected code blocks.
        """
        code_blocks = []
        lines = text.split("\n")
        current_block = []
        indent_pattern = None

        for line in lines:
            # Check for indentation (4 spaces or tab)
            if line.startswith("    ") or line.startswith("\t"):
                # Start or continue code block
                if not indent_pattern:
                    indent_pattern = line[:4] if line.startswith("    ") else "\t"
                current_block.append(line)
            else:
                # End of code block
                if current_block and len(current_block) >= 2:  # At least 2 lines
                    code_text = "\n".join(current_block).strip()
                    if len(code_text) > 20:  # Minimum code length
                        lang, confidence = self.detect_language_from_code(code_text)
                        quality = self.score_code_quality(code_text, lang, confidence)
                        is_valid, issues = self.validate_code_syntax(code_text, lang)

                        code_blocks.append(
                            {
                                "code": code_text,
                                "language": lang,
                                "confidence": confidence,
                                "quality_score": quality,
                                "is_valid": is_valid,
                                "validation_issues": issues if not is_valid else [],
                                "detection_method": "indent",
                            }
                        )
                current_block = []
                indent_pattern = None

        # Handle final block
        if current_block and len(current_block) >= 2:
            code_text = "\n".join(current_block).strip()
            if len(code_text) > 20:
                lang, confidence = self.detect_language_from_code(code_text)
                quality = self.score_code_quality(code_text, lang, confidence)
                is_valid, issues = self.validate_code_syntax(code_text, lang)

                code_blocks.append(
                    {
                        "code": code_text,
                        "language": lang,
                        "confidence": confidence,
                        "quality_score": quality,
                        "is_valid": is_valid,
                        "validation_issues": issues if not is_valid else [],
                        "detection_method": "indent",
                    }
                )

        return code_blocks

    def detect_code_blocks_by_pattern(self, text):
        """
        Detect code blocks by common code patterns (keywords, syntax).

        Returns list of detected code snippets.
        """
        code_blocks = []

        # Common code patterns that span multiple lines
        patterns = [
            # Function definitions
            (
                r"((?:def|function|func|fn|public|private)\s+\w+\s*\([^)]*\)\s*[{:]?[^}]*[}]?)",
                "function",
            ),
            # Class definitions
            (r"(class\s+\w+[^{]*\{[^}]*\})", "class"),
            # Import statements block
            (
                r"((?:import|require|use|include)[^\n]+(?:\n(?:import|require|use|include)[^\n]+)*)",
                "imports",
            ),
        ]

        for pattern, block_type in patterns:
            matches = re.finditer(pattern, text, re.MULTILINE | re.DOTALL)
            for match in matches:
                code_text = match.group(1).strip()
                if len(code_text) > 15:
                    lang, confidence = self.detect_language_from_code(code_text)
                    quality = self.score_code_quality(code_text, lang, confidence)
                    is_valid, issues = self.validate_code_syntax(code_text, lang)

                    code_blocks.append(
                        {
                            "code": code_text,
                            "language": lang,
                            "confidence": confidence,
                            "quality_score": quality,
                            "is_valid": is_valid,
                            "validation_issues": issues if not is_valid else [],
                            "detection_method": "pattern",
                            "pattern_type": block_type,
                        }
                    )

        return code_blocks
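
    # For example, the "imports" pattern captures a contiguous run such as
    # "import os\nimport re" as a single snippet, since every continuation line
    # must itself start with import/require/use/include.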

    def detect_chapter_start(self, page_data):
        """
        Detect if a page starts a new chapter/section.

        Returns (is_chapter_start, chapter_title) tuple.
        """
        headings = page_data.get("headings", [])

        # Check for h1 or h2 at start of page
        if headings:
            first_heading = headings[0]
            # H1 headings are strong indicators of chapters
            if first_heading["level"] in ["h1", "h2"]:
                return True, first_heading["text"]

        # Check for specific chapter markers in text
        text = page_data.get("text", "")
        first_line = text.split("\n")[0] if text else ""

        chapter_patterns = [
            r"^Chapter\s+\d+",
            r"^Part\s+\d+",
            r"^Section\s+\d+",
            r"^\d+\.\s+[A-Z]",  # "1. Introduction"
        ]

        for pattern in chapter_patterns:
            if re.match(pattern, first_line, re.IGNORECASE):
                return True, first_line.strip()

        return False, None
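
    # For example, a page whose first line is "Chapter 3" or "2. Getting Started"
    # is treated as a chapter start; "see figure 1" matches none of the patterns.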

    def merge_continued_code_blocks(self, pages):
        """
        Merge code blocks that are split across pages.

        Detects when a code block at the end of one page continues
        on the next page.
        """
        for i in range(len(pages) - 1):
            current_page = pages[i]
            next_page = pages[i + 1]

            # Check if current page has code blocks
            if not current_page["code_samples"]:
                continue

            # Get last code block of current page
            last_code = current_page["code_samples"][-1]

            # Check if next page starts with code
            if not next_page["code_samples"]:
                continue

            first_next_code = next_page["code_samples"][0]

            # Same language and detection method = likely continuation
            if (
                last_code["language"] == first_next_code["language"]
                and last_code["detection_method"] == first_next_code["detection_method"]
            ):
                # Check if last code block looks incomplete (doesn't end with closing brace/etc)
                last_code_text = last_code["code"].rstrip()
                continuation_indicators = [
                    not last_code_text.endswith("}"),
                    not last_code_text.endswith(";"),
                    last_code_text.endswith(","),
                    last_code_text.endswith("\\"),
                ]

                if any(continuation_indicators):
                    # Merge the code blocks
                    merged_code = last_code["code"] + "\n" + first_next_code["code"]
                    last_code["code"] = merged_code
                    last_code["merged_from_next_page"] = True

                    # Remove the first code block from next page
                    next_page["code_samples"].pop(0)
                    next_page["code_blocks_count"] -= 1

                    self.log(f"   Merged code block from page {i + 1} to {i + 2}")

        return pages
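
    # For example, a Python block ending in a trailing comma at the bottom of
    # page 3 is appended to the first block of page 4 when both were found by
    # the same detector and share a language; the merged block stays on page 3.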

    def create_chunks(self, pages):
        """
        Create chunks of pages for better organization.

        Returns array of chunks, each containing:
        - chunk_number
        - start_page, end_page
        - pages (array)
        - chapter_title (if detected)
        """
        if self.chunk_size == 0:
            # No chunking - return all pages as one chunk
            return [
                {
                    "chunk_number": 1,
                    "start_page": 1,
                    "end_page": len(pages),
                    "pages": pages,
                    "chapter_title": None,
                }
            ]

        chunks = []
        current_chunk = []
        chunk_start = 0
        current_chapter = None

        for i, page in enumerate(pages):
            # Check if this page starts a new chapter
            is_chapter, chapter_title = self.detect_chapter_start(page)

            if is_chapter and current_chunk:
                # Save current chunk before starting new one
                chunks.append(
                    {
                        "chunk_number": len(chunks) + 1,
                        "start_page": chunk_start + 1,
                        "end_page": i,
                        "pages": current_chunk,
                        "chapter_title": current_chapter,
                    }
                )
                current_chunk = []
                chunk_start = i
                current_chapter = chapter_title

            if not current_chapter and is_chapter:
                current_chapter = chapter_title

            current_chunk.append(page)

            # Check if chunk size reached (but don't break chapters)
            if not is_chapter and len(current_chunk) >= self.chunk_size:
                chunks.append(
                    {
                        "chunk_number": len(chunks) + 1,
                        "start_page": chunk_start + 1,
                        "end_page": i + 1,
                        "pages": current_chunk,
                        "chapter_title": current_chapter,
                    }
                )
                current_chunk = []
                chunk_start = i + 1
                current_chapter = None

        # Add remaining pages as final chunk
        if current_chunk:
            chunks.append(
                {
                    "chunk_number": len(chunks) + 1,
                    "start_page": chunk_start + 1,
                    "end_page": len(pages),
                    "pages": current_chunk,
                    "chapter_title": current_chapter,
                }
            )

        return chunks
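
    # For example, a 25-page PDF with no detected chapters and chunk_size=10
    # yields three chunks covering pages 1-10, 11-20, and 21-25.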

    def extract_images_from_page(self, page, page_num):
        """
        Extract images from a PDF page and save to disk (NEW in B1.5).

        Returns list of extracted image metadata.
        """
        if not self.extract_images:
            # Just count images, don't extract
            return []

        extracted = []
        image_list = page.get_images()

        for img_index, img in enumerate(image_list):
            try:
                xref = img[0]  # Image XREF number
                base_image = self.doc.extract_image(xref)

                if not base_image:
                    continue

                image_bytes = base_image["image"]
                image_ext = base_image["ext"]  # png, jpeg, etc.
                width = base_image.get("width", 0)
                height = base_image.get("height", 0)

                # Filter out small images (icons, bullets, etc.)
                if width < self.min_image_size or height < self.min_image_size:
                    self.log(f"   Skipping small image: {width}x{height}")
                    continue

                # Generate filename
                pdf_basename = Path(self.pdf_path).stem
                image_filename = f"{pdf_basename}_page{page_num + 1}_img{img_index + 1}.{image_ext}"

                # Save image
                image_path = Path(self.image_dir) / image_filename
                image_path.parent.mkdir(parents=True, exist_ok=True)

                with open(image_path, "wb") as f:
                    f.write(image_bytes)

                # Store metadata
                image_info = {
                    "filename": image_filename,
                    "path": str(image_path),
                    "page_number": page_num + 1,
                    "width": width,
                    "height": height,
                    "format": image_ext,
                    "size_bytes": len(image_bytes),
                    "xref": xref,
                }

                extracted.append(image_info)
                self.extracted_images.append(image_info)
                self.log(f"   Extracted image: {image_filename} ({width}x{height})")

            except Exception as e:
                self.log(f"   Error extracting image {img_index}: {e}")
                continue

        return extracted
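
    # For example, the first qualifying image on page 12 of manual.pdf would be
    # written as "manual_page12_img1.png" (extension varies) inside self.image_dir.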

    def extract_page(self, page_num):
        """
        Extract content from a single PDF page.

        Returns dict with page content, code blocks, and metadata.
        """
        # Check cache first (Priority 3)
        cache_key = f"page_{page_num}"
        cached = self.get_cached(cache_key)
        if cached is not None:
            self.log(f"   Page {page_num + 1}: Using cached data")
            return cached

        page = self.doc.load_page(page_num)

        # Extract plain text (with OCR if enabled - Priority 2)
        text = self.extract_text_with_ocr(page) if self.use_ocr else page.get_text("text")

        # Extract markdown (better structure preservation)
        markdown = page.get_text("markdown")

        # Extract tables (Priority 2)
        tables = self.extract_tables_from_page(page)

        # Get page images (for diagrams)
        images = page.get_images()

        # Extract images to files (NEW in B1.5)
        extracted_images = self.extract_images_from_page(page, page_num)

        # Detect code blocks using multiple methods
        font_code_blocks = self.detect_code_blocks_by_font(page)
        indent_code_blocks = self.detect_code_blocks_by_indent(text)
        pattern_code_blocks = self.detect_code_blocks_by_pattern(text)

        # Merge and deduplicate code blocks
        all_code_blocks = font_code_blocks + indent_code_blocks + pattern_code_blocks

        # Simple deduplication by code content
        unique_code = {}
        for block in all_code_blocks:
            code_hash = hash(block["code"])
            if code_hash not in unique_code:
                unique_code[code_hash] = block
            else:
                # Keep the one with higher quality score
                if block["quality_score"] > unique_code[code_hash]["quality_score"]:
                    unique_code[code_hash] = block

        code_samples = list(unique_code.values())

        # Filter by minimum quality (NEW in B1.4)
        if self.min_quality > 0:
            code_samples_before = len(code_samples)
            code_samples = [c for c in code_samples if c["quality_score"] >= self.min_quality]
            filtered_count = code_samples_before - len(code_samples)
            if filtered_count > 0:
                self.log(
                    f"   Filtered out {filtered_count} low-quality code blocks (min_quality={self.min_quality})"
                )

        # Sort by quality score (highest first)
        code_samples.sort(key=lambda x: x["quality_score"], reverse=True)

        # Extract headings from markdown
        headings = []
        for line in markdown.split("\n"):
            if line.startswith("#"):
                level = len(line) - len(line.lstrip("#"))
                heading_text = line.lstrip("#").strip()  # distinct name so the page text is not clobbered
                if heading_text:
                    headings.append({"level": f"h{level}", "text": heading_text})

        page_data = {
            "page_number": page_num + 1,  # 1-indexed for humans
            "text": text.strip(),
            "markdown": markdown.strip(),
            "headings": headings,
            "code_samples": code_samples,
            "images_count": len(images),
            "extracted_images": extracted_images,  # NEW in B1.5
            "tables": tables,  # NEW in Priority 2
            "char_count": len(text),
            "code_blocks_count": len(code_samples),
            "tables_count": len(tables),  # NEW in Priority 2
        }

        # Cache the result (Priority 3)
        self.set_cached(cache_key, page_data)

        self.log(
            f"   Page {page_num + 1}: {len(text)} chars, {len(code_samples)} code blocks, {len(headings)} headings, {len(extracted_images)} images, {len(tables)} tables"
        )

        return page_data

    def extract_all(self):
        """
        Extract content from all pages of the PDF.
        Enhanced with password support and parallel processing.

        Returns dict with metadata and pages array.
        """
        print(f"\n📄 Extracting from: {self.pdf_path}")

        # Open PDF (with password support - Priority 2)
        try:
            self.doc = fitz.open(self.pdf_path)

            # Handle encrypted PDFs (Priority 2)
            if self.doc.is_encrypted:
                if self.password:
                    print("   🔐 PDF is encrypted, trying password...")
                    if self.doc.authenticate(self.password):
                        print("   ✅ Password accepted")
                    else:
                        print("   ❌ Invalid password")
                        return None
                else:
                    print("   ❌ PDF is encrypted but no password provided")
                    print("   Use --password option to provide password")
                    return None

        except Exception as e:
            print(f"❌ Error opening PDF: {e}")
            return None

        print(f"   Pages: {len(self.doc)}")
        print(f"   Metadata: {self.doc.metadata}")

        # Set up image directory (NEW in B1.5)
        if self.extract_images and not self.image_dir:
            pdf_basename = Path(self.pdf_path).stem
            self.image_dir = f"output/{pdf_basename}_images"
            print(f"   Image directory: {self.image_dir}")

        # Show feature status
        if self.use_ocr:
            status = (
                "✅ enabled" if TESSERACT_AVAILABLE else "⚠️ not available (install pytesseract)"
            )
            print(f"   OCR: {status}")
        if self.extract_tables:
            print("   Table extraction: ✅ enabled")
        if self.parallel:
            status = "✅ enabled" if CONCURRENT_AVAILABLE else "⚠️ not available"
            print(f"   Parallel processing: {status} ({self.max_workers} workers)")
        if self.use_cache:
            print("   Caching: ✅ enabled")

        print("")

        # Extract each page (with parallel processing - Priority 3)
        if self.parallel and CONCURRENT_AVAILABLE and len(self.doc) > 5:
            print(
                f"🚀 Extracting {len(self.doc)} pages in parallel ({self.max_workers} workers)..."
            )
            with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                page_numbers = list(range(len(self.doc)))
                self.pages = list(executor.map(self.extract_page, page_numbers))
        else:
            # Sequential extraction
            for page_num in range(len(self.doc)):
                page_data = self.extract_page(page_num)
                self.pages.append(page_data)

        # Merge code blocks that span across pages
        self.log("\n🔗 Merging code blocks across pages...")
        self.pages = self.merge_continued_code_blocks(self.pages)

        # Create chunks
        self.log(f"\n📦 Creating chunks (chunk_size={self.chunk_size})...")
        chunks = self.create_chunks(self.pages)

        # Build summary
        total_chars = sum(p["char_count"] for p in self.pages)
        total_code_blocks = sum(p["code_blocks_count"] for p in self.pages)
        total_headings = sum(len(p["headings"]) for p in self.pages)
        total_images = sum(p["images_count"] for p in self.pages)
        total_tables = sum(p["tables_count"] for p in self.pages)  # NEW in Priority 2

        # Detect languages used
        languages = {}
        all_code_blocks_list = []
        for page in self.pages:
            for code in page["code_samples"]:
                lang = code["language"]
                languages[lang] = languages.get(lang, 0) + 1
                all_code_blocks_list.append(code)

        # Calculate quality statistics (NEW in B1.4)
        quality_stats = {}
        if all_code_blocks_list:
            quality_scores = [c["quality_score"] for c in all_code_blocks_list]
            confidences = [c["confidence"] for c in all_code_blocks_list]
            valid_count = sum(1 for c in all_code_blocks_list if c["is_valid"])

            quality_stats = {
                "average_quality": sum(quality_scores) / len(quality_scores),
                "average_confidence": sum(confidences) / len(confidences),
                "valid_code_blocks": valid_count,
                "invalid_code_blocks": total_code_blocks - valid_count,
                "validation_rate": valid_count / total_code_blocks if total_code_blocks > 0 else 0,
                "high_quality_blocks": sum(1 for s in quality_scores if s >= 7.0),
                "medium_quality_blocks": sum(1 for s in quality_scores if 4.0 <= s < 7.0),
                "low_quality_blocks": sum(1 for s in quality_scores if s < 4.0),
            }

        # Extract chapter information
        chapters = []
        for chunk in chunks:
            if chunk["chapter_title"]:
                chapters.append(
                    {
                        "title": chunk["chapter_title"],
                        "start_page": chunk["start_page"],
                        "end_page": chunk["end_page"],
                    }
                )

        result = {
            "source_file": self.pdf_path,
            "metadata": self.doc.metadata,
            "total_pages": len(self.doc),
            "total_chars": total_chars,
            "total_code_blocks": total_code_blocks,
            "total_headings": total_headings,
            "total_images": total_images,
            "total_extracted_images": len(self.extracted_images),  # NEW in B1.5
            "total_tables": total_tables,  # NEW in Priority 2
            "image_directory": self.image_dir if self.extract_images else None,  # NEW in B1.5
            "extracted_images": self.extracted_images,  # NEW in B1.5
            "total_chunks": len(chunks),
            "chapters": chapters,
            "languages_detected": languages,
            "quality_statistics": quality_stats,  # NEW in B1.4
            "chunks": chunks,
            "pages": self.pages,  # Still include all pages for compatibility
        }

        # Close document
        self.doc.close()

        print("\n✅ Extraction complete:")
        print(f"   Total characters: {total_chars:,}")
        print(f"   Code blocks found: {total_code_blocks}")
        print(f"   Headings found: {total_headings}")
        print(f"   Images found: {total_images}")
        if self.extract_images:
            print(f"   Images extracted: {len(self.extracted_images)}")
            if self.image_dir:
                print(f"   Image directory: {self.image_dir}")
        if self.extract_tables:
            print(f"   Tables found: {total_tables}")
        print(f"   Chunks created: {len(chunks)}")
        print(f"   Chapters detected: {len(chapters)}")
        print(f"   Languages detected: {', '.join(languages.keys())}")

        # Print quality statistics (NEW in B1.4)
        if quality_stats:
            print("\n📊 Code Quality Statistics:")
            print(f"   Average quality: {quality_stats['average_quality']:.1f}/10")
            print(f"   Average confidence: {quality_stats['average_confidence']:.1%}")
            print(
                f"   Valid code blocks: {quality_stats['valid_code_blocks']}/{total_code_blocks} ({quality_stats['validation_rate']:.1%})"
            )
            print(f"   High quality (7+): {quality_stats['high_quality_blocks']}")
            print(f"   Medium quality (4-7): {quality_stats['medium_quality_blocks']}")
            print(f"   Low quality (<4): {quality_stats['low_quality_blocks']}")

        return result


def main():
    parser = argparse.ArgumentParser(
        description="Extract text and code blocks from PDF documentation",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract from PDF
  python3 pdf_extractor_poc.py input.pdf

  # Save to JSON file
  python3 pdf_extractor_poc.py input.pdf --output result.json

  # Verbose mode
  python3 pdf_extractor_poc.py input.pdf --verbose

  # Extract and save
  python3 pdf_extractor_poc.py docs/python.pdf -o python_extracted.json -v
        """,
    )

    parser.add_argument("pdf_file", help="Path to PDF file to extract")
    parser.add_argument("-o", "--output", help="Output JSON file path (default: print to stdout)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
    parser.add_argument(
        "--chunk-size", type=int, default=10, help="Pages per chunk (0 = no chunking, default: 10)"
    )
    parser.add_argument(
        "--no-merge", action="store_true", help="Disable merging code blocks across pages"
    )
    parser.add_argument(
        "--min-quality",
        type=float,
        default=0.0,
        help="Minimum code quality score (0-10, default: 0 = no filtering)",
    )
    parser.add_argument(
        "--extract-images", action="store_true", help="Extract images to files (NEW in B1.5)"
    )
    parser.add_argument(
        "--image-dir",
        type=str,
        default=None,
        help="Directory to save extracted images (default: output/{pdf_name}_images)",
    )
    parser.add_argument(
        "--min-image-size",
        type=int,
        default=100,
        help="Minimum image dimension in pixels (filters icons, default: 100)",
    )

    # Advanced features (Priority 2 & 3)
    parser.add_argument(
        "--ocr", action="store_true", help="Use OCR for scanned PDFs (requires pytesseract)"
    )
    parser.add_argument("--password", type=str, default=None, help="Password for encrypted PDF")
    parser.add_argument(
        "--extract-tables", action="store_true", help="Extract tables from PDF (Priority 2)"
    )
    parser.add_argument(
        "--parallel", action="store_true", help="Process pages in parallel (Priority 3)"
    )
    parser.add_argument(
        "--workers", type=int, default=None, help="Number of parallel workers (default: CPU count)"
    )
    parser.add_argument(
        "--no-cache", action="store_true", help="Disable caching of expensive operations"
    )

    args = parser.parse_args()

    # Validate input file
    if not os.path.exists(args.pdf_file):
        print(f"❌ Error: File not found: {args.pdf_file}")
        sys.exit(1)

    if not args.pdf_file.lower().endswith(".pdf"):
        print("⚠️  Warning: File does not have .pdf extension")

    # Extract
    extractor = PDFExtractor(
        args.pdf_file,
        verbose=args.verbose,
        chunk_size=args.chunk_size,
        min_quality=args.min_quality,
        extract_images=args.extract_images,
        image_dir=args.image_dir,
        min_image_size=args.min_image_size,
        # Advanced features (Priority 2 & 3)
        use_ocr=args.ocr,
        password=args.password,
        extract_tables=args.extract_tables,
        parallel=args.parallel,
        max_workers=args.workers,
        use_cache=not args.no_cache,
    )
    result = extractor.extract_all()

    if result is None:
        sys.exit(1)

    # Output
    if args.output:
        # Save to file
        with open(args.output, "w", encoding="utf-8") as f:
            if args.pretty:
                json.dump(result, f, indent=2, ensure_ascii=False)
            else:
                json.dump(result, f, ensure_ascii=False)
        print(f"\n💾 Saved to: {args.output}")
    else:
        # Print to stdout
        if args.pretty:
            print("\n" + json.dumps(result, indent=2, ensure_ascii=False))
        else:
            print(json.dumps(result, ensure_ascii=False))


if __name__ == "__main__":
    main()
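
For orientation, a minimal programmatic sketch of the flow that main() drives, assuming the module path recorded in the wheel's RECORD; the input path is hypothetical:

    from skill_seekers.cli.pdf_extractor_poc import PDFExtractor

    # Hypothetical input file; min_quality mirrors the CLI's --min-quality option.
    extractor = PDFExtractor("docs/manual.pdf", verbose=True, min_quality=5.0)
    result = extractor.extract_all()  # None if the PDF cannot be opened or decrypted
    if result:
        print(result["total_code_blocks"], result["languages_detected"])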