skill_seekers-2.7.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. skill_seekers/__init__.py +22 -0
  2. skill_seekers/cli/__init__.py +39 -0
  3. skill_seekers/cli/adaptors/__init__.py +120 -0
  4. skill_seekers/cli/adaptors/base.py +221 -0
  5. skill_seekers/cli/adaptors/claude.py +485 -0
  6. skill_seekers/cli/adaptors/gemini.py +453 -0
  7. skill_seekers/cli/adaptors/markdown.py +269 -0
  8. skill_seekers/cli/adaptors/openai.py +503 -0
  9. skill_seekers/cli/ai_enhancer.py +310 -0
  10. skill_seekers/cli/api_reference_builder.py +373 -0
  11. skill_seekers/cli/architectural_pattern_detector.py +525 -0
  12. skill_seekers/cli/code_analyzer.py +1462 -0
  13. skill_seekers/cli/codebase_scraper.py +1225 -0
  14. skill_seekers/cli/config_command.py +563 -0
  15. skill_seekers/cli/config_enhancer.py +431 -0
  16. skill_seekers/cli/config_extractor.py +871 -0
  17. skill_seekers/cli/config_manager.py +452 -0
  18. skill_seekers/cli/config_validator.py +394 -0
  19. skill_seekers/cli/conflict_detector.py +528 -0
  20. skill_seekers/cli/constants.py +72 -0
  21. skill_seekers/cli/dependency_analyzer.py +757 -0
  22. skill_seekers/cli/doc_scraper.py +2332 -0
  23. skill_seekers/cli/enhance_skill.py +488 -0
  24. skill_seekers/cli/enhance_skill_local.py +1096 -0
  25. skill_seekers/cli/enhance_status.py +194 -0
  26. skill_seekers/cli/estimate_pages.py +433 -0
  27. skill_seekers/cli/generate_router.py +1209 -0
  28. skill_seekers/cli/github_fetcher.py +534 -0
  29. skill_seekers/cli/github_scraper.py +1466 -0
  30. skill_seekers/cli/guide_enhancer.py +723 -0
  31. skill_seekers/cli/how_to_guide_builder.py +1267 -0
  32. skill_seekers/cli/install_agent.py +461 -0
  33. skill_seekers/cli/install_skill.py +178 -0
  34. skill_seekers/cli/language_detector.py +614 -0
  35. skill_seekers/cli/llms_txt_detector.py +60 -0
  36. skill_seekers/cli/llms_txt_downloader.py +104 -0
  37. skill_seekers/cli/llms_txt_parser.py +150 -0
  38. skill_seekers/cli/main.py +558 -0
  39. skill_seekers/cli/markdown_cleaner.py +132 -0
  40. skill_seekers/cli/merge_sources.py +806 -0
  41. skill_seekers/cli/package_multi.py +77 -0
  42. skill_seekers/cli/package_skill.py +241 -0
  43. skill_seekers/cli/pattern_recognizer.py +1825 -0
  44. skill_seekers/cli/pdf_extractor_poc.py +1166 -0
  45. skill_seekers/cli/pdf_scraper.py +617 -0
  46. skill_seekers/cli/quality_checker.py +519 -0
  47. skill_seekers/cli/rate_limit_handler.py +438 -0
  48. skill_seekers/cli/resume_command.py +160 -0
  49. skill_seekers/cli/run_tests.py +230 -0
  50. skill_seekers/cli/setup_wizard.py +93 -0
  51. skill_seekers/cli/split_config.py +390 -0
  52. skill_seekers/cli/swift_patterns.py +560 -0
  53. skill_seekers/cli/test_example_extractor.py +1081 -0
  54. skill_seekers/cli/test_unified_simple.py +179 -0
  55. skill_seekers/cli/unified_codebase_analyzer.py +572 -0
  56. skill_seekers/cli/unified_scraper.py +932 -0
  57. skill_seekers/cli/unified_skill_builder.py +1605 -0
  58. skill_seekers/cli/upload_skill.py +162 -0
  59. skill_seekers/cli/utils.py +432 -0
  60. skill_seekers/mcp/__init__.py +33 -0
  61. skill_seekers/mcp/agent_detector.py +316 -0
  62. skill_seekers/mcp/git_repo.py +273 -0
  63. skill_seekers/mcp/server.py +231 -0
  64. skill_seekers/mcp/server_fastmcp.py +1249 -0
  65. skill_seekers/mcp/server_legacy.py +2302 -0
  66. skill_seekers/mcp/source_manager.py +285 -0
  67. skill_seekers/mcp/tools/__init__.py +115 -0
  68. skill_seekers/mcp/tools/config_tools.py +251 -0
  69. skill_seekers/mcp/tools/packaging_tools.py +826 -0
  70. skill_seekers/mcp/tools/scraping_tools.py +842 -0
  71. skill_seekers/mcp/tools/source_tools.py +828 -0
  72. skill_seekers/mcp/tools/splitting_tools.py +212 -0
  73. skill_seekers/py.typed +0 -0
  74. skill_seekers-2.7.3.dist-info/METADATA +2027 -0
  75. skill_seekers-2.7.3.dist-info/RECORD +79 -0
  76. skill_seekers-2.7.3.dist-info/WHEEL +5 -0
  77. skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
  78. skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
  79. skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
--- /dev/null
+++ skill_seekers/cli/pdf_extractor_poc.py
@@ -0,0 +1,1166 @@
+ #!/usr/bin/env python3
+ """
+ PDF Text Extractor - Complete Feature Set (Tasks B1.2 + B1.3 + B1.4 + B1.5 + Priority 2 & 3)
+
+ Extracts text, code blocks, and images from PDF documentation files.
+ Uses PyMuPDF (fitz) for fast, high-quality extraction.
+
+ Features:
+ - Text and markdown extraction
+ - Code block detection (font, indent, pattern)
+ - Language detection with confidence scoring (19+ languages) (B1.4)
+ - Syntax validation and quality scoring (B1.4)
+ - Quality statistics and filtering (B1.4)
+ - Image extraction to files (B1.5)
+ - Image filtering by size (B1.5)
+ - Page chunking and chapter detection (B1.3)
+ - Code block merging across pages (B1.3)
+
+ Advanced Features (Priority 2 & 3):
+ - OCR support for scanned PDFs (requires pytesseract) (Priority 2)
+ - Password-protected PDF support (Priority 2)
+ - Table extraction (Priority 2)
+ - Parallel page processing (Priority 3)
+ - Caching of expensive operations (Priority 3)
+
+ Usage:
+     # Basic extraction
+     python3 pdf_extractor_poc.py input.pdf
+     python3 pdf_extractor_poc.py input.pdf --output output.json
+     python3 pdf_extractor_poc.py input.pdf --verbose
+
+     # Quality filtering
+     python3 pdf_extractor_poc.py input.pdf --min-quality 5.0
+
+     # Image extraction
+     python3 pdf_extractor_poc.py input.pdf --extract-images
+     python3 pdf_extractor_poc.py input.pdf --extract-images --image-dir images/
+
+     # Advanced features
+     python3 pdf_extractor_poc.py scanned.pdf --ocr
+     python3 pdf_extractor_poc.py encrypted.pdf --password mypassword
+     python3 pdf_extractor_poc.py input.pdf --extract-tables
+     python3 pdf_extractor_poc.py large.pdf --parallel --workers 8
+
+ Example:
+     python3 pdf_extractor_poc.py docs/manual.pdf -o output.json -v \
+         --chunk-size 15 --min-quality 6.0 --extract-images \
+         --extract-tables --parallel
+ """
+
+ import argparse
+ import json
+ import os
+ import re
+ import sys
+ from pathlib import Path
+
+ # Import unified language detector
+ from skill_seekers.cli.language_detector import LanguageDetector
+
+ # Check if PyMuPDF is installed
+ try:
+     import fitz  # PyMuPDF
+ except ImportError:
+     print("ERROR: PyMuPDF not installed")
+     print("Install with: pip install PyMuPDF")
+     sys.exit(1)
+
+ # Optional dependencies for advanced features
+ try:
+     import pytesseract
+     from PIL import Image
+
+     TESSERACT_AVAILABLE = True
+ except ImportError:
+     TESSERACT_AVAILABLE = False
+
+ try:
+     import concurrent.futures
+
+     CONCURRENT_AVAILABLE = True
+ except ImportError:
+     CONCURRENT_AVAILABLE = False
+
+
+ class PDFExtractor:
+     """Extract text and code from PDF documentation"""
+
+     def __init__(
+         self,
+         pdf_path,
+         verbose=False,
+         chunk_size=10,
+         min_quality=0.0,
+         extract_images=False,
+         image_dir=None,
+         min_image_size=100,
+         use_ocr=False,
+         password=None,
+         extract_tables=False,
+         parallel=False,
+         max_workers=None,
+         use_cache=True,
+     ):
+         self.pdf_path = pdf_path
+         self.verbose = verbose
+         self.chunk_size = chunk_size  # Pages per chunk (0 = no chunking)
+         self.min_quality = min_quality  # Minimum quality score (0-10)
+         self.extract_images = extract_images  # Extract images to files (NEW in B1.5)
+         self.image_dir = image_dir  # Directory to save images (NEW in B1.5)
+         self.min_image_size = min_image_size  # Minimum image dimension (NEW in B1.5)
+
+         # Advanced features (Priority 2 & 3)
+         self.use_ocr = use_ocr  # OCR for scanned PDFs (Priority 2)
+         self.password = password  # Password for encrypted PDFs (Priority 2)
+         self.extract_tables = extract_tables  # Extract tables (Priority 2)
+         self.parallel = parallel  # Parallel processing (Priority 3)
+         self.max_workers = max_workers or os.cpu_count()  # Worker threads (Priority 3)
+         self.use_cache = use_cache  # Cache expensive operations (Priority 3)
+
+         self.doc = None
+         self.pages = []
+         self.chapters = []  # Detected chapters/sections
+         self.extracted_images = []  # List of extracted image info (NEW in B1.5)
+         self._cache = {}  # Cache for expensive operations (Priority 3)
+
+         # Language detection
+         self.language_detector = LanguageDetector(min_confidence=0.15)
+
+     def log(self, message):
+         """Print message if verbose mode enabled"""
+         if self.verbose:
+             print(message)
+
+     def extract_text_with_ocr(self, page):
+         """
+         Extract text from scanned PDF page using OCR (Priority 2).
+         Falls back to regular text extraction if OCR is not available.
+
+         Args:
+             page: PyMuPDF page object
+
+         Returns:
+             str: Extracted text
+         """
+         # Try regular text extraction first
+         text = page.get_text("text").strip()
+
+         # If page has very little text, it might be scanned
+         if len(text) < 50 and self.use_ocr:
+             if not TESSERACT_AVAILABLE:
+                 self.log("⚠️ OCR requested but pytesseract not installed")
+                 self.log(" Install with: pip install pytesseract Pillow")
+                 return text
+
+             try:
+                 # Render page as image
+                 pix = page.get_pixmap()
+                 img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+
+                 # Run OCR
+                 ocr_text = pytesseract.image_to_string(img)
+                 self.log(f" OCR extracted {len(ocr_text)} chars (was {len(text)})")
+                 return ocr_text if len(ocr_text) > len(text) else text
+
+             except Exception as e:
+                 self.log(f" OCR failed: {e}")
+                 return text
+
+         return text
+
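The OCR fallback above renders the page at PyMuPDF's default resolution (72 dpi base), which is often too coarse for Tesseract. A minimal sketch of a higher-resolution variant — the helper name and the 2x zoom factor are illustrative, not part of the package:

```python
# Hypothetical helper: render at 2x (~144 dpi) before OCR.
# fitz.Matrix and Page.get_pixmap(matrix=...) are standard PyMuPDF APIs.
import fitz  # PyMuPDF
import pytesseract
from PIL import Image

def ocr_page_at_zoom(page, zoom=2.0):
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    return pytesseract.image_to_string(img)
```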
+     def extract_tables_from_page(self, page):
+         """
+         Extract tables from PDF page (Priority 2).
+         Uses PyMuPDF's table detection.
+
+         Args:
+             page: PyMuPDF page object
+
+         Returns:
+             list: List of extracted tables as dicts
+         """
+         if not self.extract_tables:
+             return []
+
+         tables = []
+         try:
+             # PyMuPDF table extraction
+             tabs = page.find_tables()
+             for idx, tab in enumerate(tabs.tables):
+                 table_data = {
+                     "table_index": idx,
+                     "rows": tab.extract(),
+                     "bbox": tab.bbox,
+                     "row_count": len(tab.extract()),
+                     "col_count": len(tab.extract()[0]) if tab.extract() else 0,
+                 }
+                 tables.append(table_data)
+                 self.log(
+                     f" Found table {idx}: {table_data['row_count']}x{table_data['col_count']}"
+                 )
+
+         except Exception as e:
+             self.log(f" Table extraction failed: {e}")
+
+         return tables
+
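`Table.extract()` returns each table as a list of rows, each row a list of cell strings (or `None` for empty cells). If you want to feed the result into markdown-based tooling, a small formatter suffices; this sketch assumes PyMuPDF >= 1.23, which introduced `Page.find_tables()`:

```python
# Sketch: render one extracted table's rows as a Markdown table.
def rows_to_markdown(rows):
    """rows: list of row lists, as returned by Table.extract()."""
    if not rows:
        return ""
    header = [str(c) if c is not None else "" for c in rows[0]]
    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join("---" for _ in header) + " |",
    ]
    for row in rows[1:]:
        cells = [str(c) if c is not None else "" for c in row]
        lines.append("| " + " | ".join(cells) + " |")
    return "\n".join(lines)
```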
+     def get_cached(self, key):
+         """
+         Get cached value (Priority 3).
+
+         Args:
+             key: Cache key
+
+         Returns:
+             Cached value or None
+         """
+         if not self.use_cache:
+             return None
+         return self._cache.get(key)
+
+     def set_cached(self, key, value):
+         """
+         Set cached value (Priority 3).
+
+         Args:
+             key: Cache key
+             value: Value to cache
+         """
+         if self.use_cache:
+             self._cache[key] = value
+
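The pair above is a deliberately simple string-keyed memo. The intended usage pattern inside the class looks like this — the `"toc"` key and `expensive_toc_scan` are illustrative stand-ins, not package APIs:

```python
# Sketch of the get/set protocol around any expensive per-document step.
toc = self.get_cached("toc")
if toc is None:
    toc = expensive_toc_scan(self.doc)  # hypothetical expensive operation
    self.set_cached("toc", toc)
```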
+     def detect_language_from_code(self, code):
+         """
+         Detect programming language from code content using patterns.
+         Enhanced in B1.4 with confidence scoring.
+
+         UPDATED: Now uses shared LanguageDetector with 20+ languages
+
+         Returns (language, confidence) tuple
+         """
+         return self.language_detector.detect_from_code(code)
+
+     def validate_code_syntax(self, code, language):
+         """
+         Validate code syntax (basic checks).
+         Enhanced in B1.4 with syntax validation.
+
+         Returns (is_valid, issues) tuple
+         """
+         issues = []
+
+         # Common syntax checks
+         if not code.strip():
+             return False, ["Empty code block"]
+
+         # Language-specific validation
+         if language == "python":
+             # Check indentation consistency
+             lines = code.split("\n")
+             indent_chars = set()
+             for line in lines:
+                 if line.startswith(" "):
+                     indent_chars.add("space")
+                 elif line.startswith("\t"):
+                     indent_chars.add("tab")
+
+             if len(indent_chars) > 1:
+                 issues.append("Mixed tabs and spaces")
+
+             # Check for unclosed brackets/parens
+             open_count = code.count("(") + code.count("[") + code.count("{")
+             close_count = code.count(")") + code.count("]") + code.count("}")
+             if abs(open_count - close_count) > 2:  # Allow small mismatch
+                 issues.append("Unbalanced brackets")
+
+         elif language in ["javascript", "java", "cpp", "c", "csharp", "go"]:
+             # Check for balanced braces
+             open_braces = code.count("{")
+             close_braces = code.count("}")
+             if abs(open_braces - close_braces) > 1:
+                 issues.append("Unbalanced braces")
+
+         elif language == "json":
+             # Try to parse JSON
+             try:
+                 json.loads(code)
+             except (json.JSONDecodeError, ValueError) as e:
+                 issues.append(f"Invalid JSON syntax: {str(e)[:50]}")
+
+         # General checks
+         # Check if code looks like natural language (too many common words)
+         common_words = ["the", "and", "for", "with", "this", "that", "have", "from"]
+         word_count = sum(1 for word in common_words if word in code.lower())
+         if word_count > 5 and len(code.split()) < 50:
+             issues.append("May be natural language, not code")
+
+         # Check code/comment ratio
+         comment_lines = sum(
+             1 for line in code.split("\n") if line.strip().startswith(("#", "//", "/*", "*", "--"))
+         )
+         total_lines = len([line for line in code.split("\n") if line.strip()])
+         if total_lines > 0 and comment_lines / total_lines > 0.7:
+             issues.append("Mostly comments")
+
+         return len(issues) == 0, issues
+
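For Python specifically, the checks above are heuristics by design: code lifted from a PDF is frequently a fragment that a real parser would reject. When strict validation is wanted, `ast.parse` from the standard library settles the question; a sketch (not what the extractor ships):

```python
import ast

def python_syntax_ok(code):
    """Strict check: True only if the snippet parses as a Python module."""
    try:
        ast.parse(code)
        return True, None
    except SyntaxError as e:
        return False, f"line {e.lineno}: {e.msg}"
```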
+     def score_code_quality(self, code, language, confidence):
+         """
+         Score the quality/usefulness of detected code block.
+         New in B1.4.
+
+         Returns quality score (0-10)
+         """
+         score = 5.0  # Start with neutral score
+
+         # Factor 1: Language detection confidence
+         score += confidence * 2.0
+
+         # Factor 2: Code length (not too short, not too long)
+         code_length = len(code.strip())
+         if 20 <= code_length <= 500:
+             score += 1.0
+         elif 500 < code_length <= 2000:
+             score += 0.5
+         elif code_length < 10:
+             score -= 2.0
+
+         # Factor 3: Number of lines
+         lines = [line for line in code.split("\n") if line.strip()]
+         if 2 <= len(lines) <= 50:
+             score += 1.0
+         elif len(lines) > 100:
+             score -= 1.0
+
+         # Factor 4: Has function/class definitions
+         if re.search(r"\b(def|function|class|func|fn|public class)\b", code):
+             score += 1.5
+
+         # Factor 5: Has meaningful variable names (not just x, y, i)
+         meaningful_vars = re.findall(r"\b[a-z_][a-z0-9_]{3,}\b", code.lower())
+         if len(meaningful_vars) >= 2:
+             score += 1.0
+
+         # Factor 6: Syntax validation
+         is_valid, issues = self.validate_code_syntax(code, language)
+         if is_valid:
+             score += 1.0
+         else:
+             score -= len(issues) * 0.5
+
+         # Clamp score to 0-10 range
+         return max(0, min(10, score))
+
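To make the arithmetic concrete, suppose the language detector reported `("python", 0.8)` for a two-line snippet: 5.0 base + 1.6 (confidence x 2.0) + 1.0 (length 20-500) + 1.0 (2-50 lines) + 1.5 (has `def`) + 1.0 (passes validation) = 11.1, clamped to 10. As a runnable check (the PDF path is illustrative; `__init__` only stores settings):

```python
extractor = PDFExtractor("manual.pdf")
snippet = "def add(a, b):\n    return a + b"
print(extractor.score_code_quality(snippet, "python", 0.8))  # -> 10 (11.1, clamped)
```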
+     def detect_code_blocks_by_font(self, page):
+         """
+         Detect code blocks by analyzing font properties.
+         Monospace fonts typically indicate code.
+
+         Returns list of detected code blocks with metadata.
+         """
+         code_blocks = []
+         blocks = page.get_text("dict")["blocks"]
+
+         monospace_fonts = ["courier", "mono", "consolas", "menlo", "monaco", "dejavu"]
+
+         current_code = []
+         current_font = None
+
+         for block in blocks:
+             if "lines" not in block:
+                 continue
+
+             for line in block["lines"]:
+                 for span in line["spans"]:
+                     font = span["font"].lower()
+                     text = span["text"]
+
+                     # Check if font is monospace
+                     is_monospace = any(mf in font for mf in monospace_fonts)
+
+                     if is_monospace:
+                         # Accumulate code text
+                         current_code.append(text)
+                         current_font = span["font"]
+                     else:
+                         # End of code block
+                         if current_code:
+                             code_text = "".join(current_code).strip()
+                             if len(code_text) > 10:  # Minimum code length
+                                 lang, confidence = self.detect_language_from_code(code_text)
+                                 quality = self.score_code_quality(code_text, lang, confidence)
+                                 is_valid, issues = self.validate_code_syntax(code_text, lang)
+
+                                 code_blocks.append(
+                                     {
+                                         "code": code_text,
+                                         "language": lang,
+                                         "confidence": confidence,
+                                         "quality_score": quality,
+                                         "is_valid": is_valid,
+                                         "validation_issues": issues if not is_valid else [],
+                                         "font": current_font,
+                                         "detection_method": "font",
+                                     }
+                                 )
+                             current_code = []
+                             current_font = None
+
+         # Handle final code block
+         if current_code:
+             code_text = "".join(current_code).strip()
+             if len(code_text) > 10:
+                 lang, confidence = self.detect_language_from_code(code_text)
+                 quality = self.score_code_quality(code_text, lang, confidence)
+                 is_valid, issues = self.validate_code_syntax(code_text, lang)
+
+                 code_blocks.append(
+                     {
+                         "code": code_text,
+                         "language": lang,
+                         "confidence": confidence,
+                         "quality_score": quality,
+                         "is_valid": is_valid,
+                         "validation_issues": issues if not is_valid else [],
+                         "font": current_font,
+                         "detection_method": "font",
+                     }
+                 )
+
+         return code_blocks
+
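The monospace list is substring-matched against lowercased font names, so a font like "DejaVuSansMono" matches on both "mono" and "dejavu". If a PDF's code font is something unusual, a quick diagnostic is to dump the fonts a page actually uses (sketch; the path is illustrative):

```python
import fitz

doc = fitz.open("manual.pdf")
fonts = {
    span["font"]
    for block in doc[0].get_text("dict")["blocks"] if "lines" in block
    for line in block["lines"]
    for span in line["spans"]
}
print(sorted(fonts))
```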
+     def detect_code_blocks_by_indent(self, text):
+         """
+         Detect code blocks by indentation patterns.
+         Code often has consistent indentation.
+
+         Returns list of detected code blocks.
+         """
+         code_blocks = []
+         lines = text.split("\n")
+         current_block = []
+         indent_pattern = None
+
+         for line in lines:
+             # Check for indentation (4 spaces or tab)
+             if line.startswith("    ") or line.startswith("\t"):
+                 # Start or continue code block
+                 if not indent_pattern:
+                     indent_pattern = line[:4] if line.startswith("    ") else "\t"
+                 current_block.append(line)
+             else:
+                 # End of code block
+                 if current_block and len(current_block) >= 2:  # At least 2 lines
+                     code_text = "\n".join(current_block).strip()
+                     if len(code_text) > 20:  # Minimum code length
+                         lang, confidence = self.detect_language_from_code(code_text)
+                         quality = self.score_code_quality(code_text, lang, confidence)
+                         is_valid, issues = self.validate_code_syntax(code_text, lang)
+
+                         code_blocks.append(
+                             {
+                                 "code": code_text,
+                                 "language": lang,
+                                 "confidence": confidence,
+                                 "quality_score": quality,
+                                 "is_valid": is_valid,
+                                 "validation_issues": issues if not is_valid else [],
+                                 "detection_method": "indent",
+                             }
+                         )
+                 current_block = []
+                 indent_pattern = None
+
+         # Handle final block
+         if current_block and len(current_block) >= 2:
+             code_text = "\n".join(current_block).strip()
+             if len(code_text) > 20:
+                 lang, confidence = self.detect_language_from_code(code_text)
+                 quality = self.score_code_quality(code_text, lang, confidence)
+                 is_valid, issues = self.validate_code_syntax(code_text, lang)
+
+                 code_blocks.append(
+                     {
+                         "code": code_text,
+                         "language": lang,
+                         "confidence": confidence,
+                         "quality_score": quality,
+                         "is_valid": is_valid,
+                         "validation_issues": issues if not is_valid else [],
+                         "detection_method": "indent",
+                     }
+                 )
+
+         return code_blocks
+
+     def detect_code_blocks_by_pattern(self, text):
+         """
+         Detect code blocks by common code patterns (keywords, syntax).
+
+         Returns list of detected code snippets.
+         """
+         code_blocks = []
+
+         # Common code patterns that span multiple lines
+         patterns = [
+             # Function definitions
+             (
+                 r"((?:def|function|func|fn|public|private)\s+\w+\s*\([^)]*\)\s*[{:]?[^}]*[}]?)",
+                 "function",
+             ),
+             # Class definitions
+             (r"(class\s+\w+[^{]*\{[^}]*\})", "class"),
+             # Import statements block
+             (
+                 r"((?:import|require|use|include)[^\n]+(?:\n(?:import|require|use|include)[^\n]+)*)",
+                 "imports",
+             ),
+         ]
+
+         for pattern, block_type in patterns:
+             matches = re.finditer(pattern, text, re.MULTILINE | re.DOTALL)
+             for match in matches:
+                 code_text = match.group(1).strip()
+                 if len(code_text) > 15:
+                     lang, confidence = self.detect_language_from_code(code_text)
+                     quality = self.score_code_quality(code_text, lang, confidence)
+                     is_valid, issues = self.validate_code_syntax(code_text, lang)
+
+                     code_blocks.append(
+                         {
+                             "code": code_text,
+                             "language": lang,
+                             "confidence": confidence,
+                             "quality_score": quality,
+                             "is_valid": is_valid,
+                             "validation_issues": issues if not is_valid else [],
+                             "detection_method": "pattern",
+                             "pattern_type": block_type,
+                         }
+                     )
+
+         return code_blocks
+
+     def detect_chapter_start(self, page_data):
+         """
+         Detect if a page starts a new chapter/section.
+
+         Returns (is_chapter_start, chapter_title) tuple.
+         """
+         headings = page_data.get("headings", [])
+
+         # Check for h1 or h2 at start of page
+         if headings:
+             first_heading = headings[0]
+             # H1 headings are strong indicators of chapters
+             if first_heading["level"] in ["h1", "h2"]:
+                 return True, first_heading["text"]
+
+         # Check for specific chapter markers in text
+         text = page_data.get("text", "")
+         first_line = text.split("\n")[0] if text else ""
+
+         chapter_patterns = [
+             r"^Chapter\s+\d+",
+             r"^Part\s+\d+",
+             r"^Section\s+\d+",
+             r"^\d+\.\s+[A-Z]",  # "1. Introduction"
+         ]
+
+         for pattern in chapter_patterns:
+             if re.match(pattern, first_line, re.IGNORECASE):
+                 return True, first_line.strip()
+
+         return False, None
+
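A quick check of the fallback path with a hand-built page dict (no headings, so the regex branch decides); `extractor` is any `PDFExtractor` instance:

```python
page = {"headings": [], "text": "Chapter 3\nMemory management in depth..."}
print(extractor.detect_chapter_start(page))  # -> (True, "Chapter 3")
```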
+     def merge_continued_code_blocks(self, pages):
+         """
+         Merge code blocks that are split across pages.
+
+         Detects when a code block at the end of one page continues
+         on the next page.
+         """
+         for i in range(len(pages) - 1):
+             current_page = pages[i]
+             next_page = pages[i + 1]
+
+             # Check if current page has code blocks
+             if not current_page["code_samples"]:
+                 continue
+
+             # Get last code block of current page
+             last_code = current_page["code_samples"][-1]
+
+             # Check if next page starts with code
+             if not next_page["code_samples"]:
+                 continue
+
+             first_next_code = next_page["code_samples"][0]
+
+             # Same language and detection method = likely continuation
+             if (
+                 last_code["language"] == first_next_code["language"]
+                 and last_code["detection_method"] == first_next_code["detection_method"]
+             ):
+                 # Check if last code block looks incomplete (doesn't end with closing brace/etc)
+                 last_code_text = last_code["code"].rstrip()
+                 # A block looks unfinished if it lacks any closing terminator or
+                 # ends mid-expression. (Fixed: the original listed the "}" and ";"
+                 # checks as separate entries, so any() was always true and every
+                 # adjacent same-language pair got merged.)
+                 continuation_indicators = [
+                     not last_code_text.endswith(("}", ";")),
+                     last_code_text.endswith(","),
+                     last_code_text.endswith("\\"),
+                 ]
+
+                 if any(continuation_indicators):
+                     # Merge the code blocks
+                     merged_code = last_code["code"] + "\n" + first_next_code["code"]
+                     last_code["code"] = merged_code
+                     last_code["merged_from_next_page"] = True
+
+                     # Remove the first code block from next page
+                     next_page["code_samples"].pop(0)
+                     next_page["code_blocks_count"] -= 1
+
+                     self.log(f" Merged code block from page {i + 1} to {i + 2}")
+
+         return pages
+
+     def create_chunks(self, pages):
+         """
+         Create chunks of pages for better organization.
+
+         Returns array of chunks, each containing:
+         - chunk_number
+         - start_page, end_page
+         - pages (array)
+         - chapter_title (if detected)
+         """
+         if self.chunk_size == 0:
+             # No chunking - return all pages as one chunk
+             return [
+                 {
+                     "chunk_number": 1,
+                     "start_page": 1,
+                     "end_page": len(pages),
+                     "pages": pages,
+                     "chapter_title": None,
+                 }
+             ]
+
+         chunks = []
+         current_chunk = []
+         chunk_start = 0
+         current_chapter = None
+
+         for i, page in enumerate(pages):
+             # Check if this page starts a new chapter
+             is_chapter, chapter_title = self.detect_chapter_start(page)
+
+             if is_chapter and current_chunk:
+                 # Save current chunk before starting new one
+                 chunks.append(
+                     {
+                         "chunk_number": len(chunks) + 1,
+                         "start_page": chunk_start + 1,
+                         "end_page": i,
+                         "pages": current_chunk,
+                         "chapter_title": current_chapter,
+                     }
+                 )
+                 current_chunk = []
+                 chunk_start = i
+                 current_chapter = chapter_title
+
+             if not current_chapter and is_chapter:
+                 current_chapter = chapter_title
+
+             current_chunk.append(page)
+
+             # Check if chunk size reached (but don't break chapters)
+             if not is_chapter and len(current_chunk) >= self.chunk_size:
+                 chunks.append(
+                     {
+                         "chunk_number": len(chunks) + 1,
+                         "start_page": chunk_start + 1,
+                         "end_page": i + 1,
+                         "pages": current_chunk,
+                         "chapter_title": current_chapter,
+                     }
+                 )
+                 current_chunk = []
+                 chunk_start = i + 1
+                 current_chapter = None
+
+         # Add remaining pages as final chunk
+         if current_chunk:
+             chunks.append(
+                 {
+                     "chunk_number": len(chunks) + 1,
+                     "start_page": chunk_start + 1,
+                     "end_page": len(pages),
+                     "pages": current_chunk,
+                     "chapter_title": current_chapter,
+                 }
+             )
+
+         return chunks
+
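With the default `chunk_size=10` and no detected chapters, 25 uniform pages therefore split 10 + 10 + 5. A sketch with stub pages (only the keys `detect_chapter_start` reads are needed; `extractor` as above):

```python
stub_pages = [{"headings": [], "text": ""} for _ in range(25)]
for c in extractor.create_chunks(stub_pages):
    print(c["chunk_number"], c["start_page"], c["end_page"])
# 1 1 10
# 2 11 20
# 3 21 25
```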
+     def extract_images_from_page(self, page, page_num):
+         """
+         Extract images from a PDF page and save to disk (NEW in B1.5).
+
+         Returns list of extracted image metadata.
+         """
+         if not self.extract_images:
+             # Just count images, don't extract
+             return []
+
+         extracted = []
+         image_list = page.get_images()
+
+         for img_index, img in enumerate(image_list):
+             try:
+                 xref = img[0]  # Image XREF number
+                 base_image = self.doc.extract_image(xref)
+
+                 if not base_image:
+                     continue
+
+                 image_bytes = base_image["image"]
+                 image_ext = base_image["ext"]  # png, jpeg, etc.
+                 width = base_image.get("width", 0)
+                 height = base_image.get("height", 0)
+
+                 # Filter out small images (icons, bullets, etc.)
+                 if width < self.min_image_size or height < self.min_image_size:
+                     self.log(f" Skipping small image: {width}x{height}")
+                     continue
+
+                 # Generate filename
+                 pdf_basename = Path(self.pdf_path).stem
+                 image_filename = f"{pdf_basename}_page{page_num + 1}_img{img_index + 1}.{image_ext}"
+
+                 # Save image
+                 image_path = Path(self.image_dir) / image_filename
+                 image_path.parent.mkdir(parents=True, exist_ok=True)
+
+                 with open(image_path, "wb") as f:
+                     f.write(image_bytes)
+
+                 # Store metadata
+                 image_info = {
+                     "filename": image_filename,
+                     "path": str(image_path),
+                     "page_number": page_num + 1,
+                     "width": width,
+                     "height": height,
+                     "format": image_ext,
+                     "size_bytes": len(image_bytes),
+                     "xref": xref,
+                 }
+
+                 extracted.append(image_info)
+                 self.extracted_images.append(image_info)
+                 self.log(f" Extracted image: {image_filename} ({width}x{height})")
+
+             except Exception as e:
+                 self.log(f" Error extracting image {img_index}: {e}")
+                 continue
+
+         return extracted
+
+     def extract_page(self, page_num):
+         """
+         Extract content from a single PDF page.
+
+         Returns dict with page content, code blocks, and metadata.
+         """
+         # Check cache first (Priority 3)
+         cache_key = f"page_{page_num}"
+         cached = self.get_cached(cache_key)
+         if cached is not None:
+             self.log(f" Page {page_num + 1}: Using cached data")
+             return cached
+
+         page = self.doc.load_page(page_num)
+
+         # Extract plain text (with OCR if enabled - Priority 2)
+         text = self.extract_text_with_ocr(page) if self.use_ocr else page.get_text("text")
+
+         # Extract markdown (better structure preservation)
+         markdown = page.get_text("markdown")
+
+         # Extract tables (Priority 2)
+         tables = self.extract_tables_from_page(page)
+
+         # Get page images (for diagrams)
+         images = page.get_images()
+
+         # Extract images to files (NEW in B1.5)
+         extracted_images = self.extract_images_from_page(page, page_num)
+
+         # Detect code blocks using multiple methods
+         font_code_blocks = self.detect_code_blocks_by_font(page)
+         indent_code_blocks = self.detect_code_blocks_by_indent(text)
+         pattern_code_blocks = self.detect_code_blocks_by_pattern(text)
+
+         # Merge and deduplicate code blocks
+         all_code_blocks = font_code_blocks + indent_code_blocks + pattern_code_blocks
+
+         # Simple deduplication by code content
+         unique_code = {}
+         for block in all_code_blocks:
+             code_hash = hash(block["code"])
+             if code_hash not in unique_code:
+                 unique_code[code_hash] = block
+             else:
+                 # Keep the one with higher quality score
+                 if block["quality_score"] > unique_code[code_hash]["quality_score"]:
+                     unique_code[code_hash] = block
+
+         code_samples = list(unique_code.values())
+
+         # Filter by minimum quality (NEW in B1.4)
+         if self.min_quality > 0:
+             code_samples_before = len(code_samples)
+             code_samples = [c for c in code_samples if c["quality_score"] >= self.min_quality]
+             filtered_count = code_samples_before - len(code_samples)
+             if filtered_count > 0:
+                 self.log(
+                     f" Filtered out {filtered_count} low-quality code blocks (min_quality={self.min_quality})"
+                 )
+
+         # Sort by quality score (highest first)
+         code_samples.sort(key=lambda x: x["quality_score"], reverse=True)
+
+         # Extract headings from markdown
+         headings = []
+         for line in markdown.split("\n"):
+             if line.startswith("#"):
+                 level = len(line) - len(line.lstrip("#"))
+                 heading_text = line.lstrip("#").strip()  # fixed: was `text`, which clobbered the page text extracted above
+                 if heading_text:
+                     headings.append({"level": f"h{level}", "text": heading_text})
+
+         page_data = {
+             "page_number": page_num + 1,  # 1-indexed for humans
+             "text": text.strip(),
+             "markdown": markdown.strip(),
+             "headings": headings,
+             "code_samples": code_samples,
+             "images_count": len(images),
+             "extracted_images": extracted_images,  # NEW in B1.5
+             "tables": tables,  # NEW in Priority 2
+             "char_count": len(text),
+             "code_blocks_count": len(code_samples),
+             "tables_count": len(tables),  # NEW in Priority 2
+         }
+
+         # Cache the result (Priority 3)
+         self.set_cached(cache_key, page_data)
+
+         self.log(
+             f" Page {page_num + 1}: {len(text)} chars, {len(code_samples)} code blocks, {len(headings)} headings, {len(extracted_images)} images, {len(tables)} tables"
+         )
+
+         return page_data
+
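One robustness nit in the dedup step above: the map keys on `hash(block["code"])`, which works but adds a (tiny) collision risk for no gain, since the string itself is hashable. An equivalent sketch keyed on the text:

```python
unique = {}
for block in all_code_blocks:
    prev = unique.get(block["code"])
    if prev is None or block["quality_score"] > prev["quality_score"]:
        unique[block["code"]] = block
code_samples = list(unique.values())
```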
+     def extract_all(self):
+         """
+         Extract content from all pages of the PDF.
+         Enhanced with password support and parallel processing.
+
+         Returns dict with metadata and pages array.
+         """
+         print(f"\n📄 Extracting from: {self.pdf_path}")
+
+         # Open PDF (with password support - Priority 2)
+         try:
+             self.doc = fitz.open(self.pdf_path)
+
+             # Handle encrypted PDFs (Priority 2)
+             if self.doc.is_encrypted:
+                 if self.password:
+                     print(" 🔐 PDF is encrypted, trying password...")
+                     if self.doc.authenticate(self.password):
+                         print(" ✅ Password accepted")
+                     else:
+                         print(" ❌ Invalid password")
+                         return None
+                 else:
+                     print(" ❌ PDF is encrypted but no password provided")
+                     print(" Use --password option to provide password")
+                     return None
+
+         except Exception as e:
+             print(f"❌ Error opening PDF: {e}")
+             return None
+
+         print(f" Pages: {len(self.doc)}")
+         print(f" Metadata: {self.doc.metadata}")
+
+         # Set up image directory (NEW in B1.5)
+         if self.extract_images and not self.image_dir:
+             pdf_basename = Path(self.pdf_path).stem
+             self.image_dir = f"output/{pdf_basename}_images"
+             print(f" Image directory: {self.image_dir}")
+
+         # Show feature status
+         if self.use_ocr:
+             status = (
+                 "✅ enabled" if TESSERACT_AVAILABLE else "⚠️ not available (install pytesseract)"
+             )
+             print(f" OCR: {status}")
+         if self.extract_tables:
+             print(" Table extraction: ✅ enabled")
+         if self.parallel:
+             status = "✅ enabled" if CONCURRENT_AVAILABLE else "⚠️ not available"
+             print(f" Parallel processing: {status} ({self.max_workers} workers)")
+         if self.use_cache:
+             print(" Caching: ✅ enabled")
+
+         print("")
+
+         # Extract each page (with parallel processing - Priority 3)
+         if self.parallel and CONCURRENT_AVAILABLE and len(self.doc) > 5:
+             print(
+                 f"🚀 Extracting {len(self.doc)} pages in parallel ({self.max_workers} workers)..."
+             )
+             with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                 page_numbers = list(range(len(self.doc)))
+                 self.pages = list(executor.map(self.extract_page, page_numbers))
+         else:
+             # Sequential extraction
+             for page_num in range(len(self.doc)):
+                 page_data = self.extract_page(page_num)
+                 self.pages.append(page_data)
+
+         # Merge code blocks that span across pages
+         self.log("\n🔗 Merging code blocks across pages...")
+         self.pages = self.merge_continued_code_blocks(self.pages)
+
+         # Create chunks
+         self.log(f"\n📦 Creating chunks (chunk_size={self.chunk_size})...")
+         chunks = self.create_chunks(self.pages)
+
+         # Build summary
+         total_chars = sum(p["char_count"] for p in self.pages)
+         total_code_blocks = sum(p["code_blocks_count"] for p in self.pages)
+         total_headings = sum(len(p["headings"]) for p in self.pages)
+         total_images = sum(p["images_count"] for p in self.pages)
+         total_tables = sum(p["tables_count"] for p in self.pages)  # NEW in Priority 2
+
+         # Detect languages used
+         languages = {}
+         all_code_blocks_list = []
+         for page in self.pages:
+             for code in page["code_samples"]:
+                 lang = code["language"]
+                 languages[lang] = languages.get(lang, 0) + 1
+                 all_code_blocks_list.append(code)
+
+         # Calculate quality statistics (NEW in B1.4)
+         quality_stats = {}
+         if all_code_blocks_list:
+             quality_scores = [c["quality_score"] for c in all_code_blocks_list]
+             confidences = [c["confidence"] for c in all_code_blocks_list]
+             valid_count = sum(1 for c in all_code_blocks_list if c["is_valid"])
+
+             quality_stats = {
+                 "average_quality": sum(quality_scores) / len(quality_scores),
+                 "average_confidence": sum(confidences) / len(confidences),
+                 "valid_code_blocks": valid_count,
+                 "invalid_code_blocks": total_code_blocks - valid_count,
+                 "validation_rate": valid_count / total_code_blocks if total_code_blocks > 0 else 0,
+                 "high_quality_blocks": sum(1 for s in quality_scores if s >= 7.0),
+                 "medium_quality_blocks": sum(1 for s in quality_scores if 4.0 <= s < 7.0),
+                 "low_quality_blocks": sum(1 for s in quality_scores if s < 4.0),
+             }
+
+         # Extract chapter information
+         chapters = []
+         for chunk in chunks:
+             if chunk["chapter_title"]:
+                 chapters.append(
+                     {
+                         "title": chunk["chapter_title"],
+                         "start_page": chunk["start_page"],
+                         "end_page": chunk["end_page"],
+                     }
+                 )
+
+         result = {
+             "source_file": self.pdf_path,
+             "metadata": self.doc.metadata,
+             "total_pages": len(self.doc),
+             "total_chars": total_chars,
+             "total_code_blocks": total_code_blocks,
+             "total_headings": total_headings,
+             "total_images": total_images,
+             "total_extracted_images": len(self.extracted_images),  # NEW in B1.5
+             "total_tables": total_tables,  # NEW in Priority 2
+             "image_directory": self.image_dir if self.extract_images else None,  # NEW in B1.5
+             "extracted_images": self.extracted_images,  # NEW in B1.5
+             "total_chunks": len(chunks),
+             "chapters": chapters,
+             "languages_detected": languages,
+             "quality_statistics": quality_stats,  # NEW in B1.4
+             "chunks": chunks,
+             "pages": self.pages,  # Still include all pages for compatibility
+         }
+
+         # Close document
+         self.doc.close()
+
+         print("\n✅ Extraction complete:")
+         print(f" Total characters: {total_chars:,}")
+         print(f" Code blocks found: {total_code_blocks}")
+         print(f" Headings found: {total_headings}")
+         print(f" Images found: {total_images}")
+         if self.extract_images:
+             print(f" Images extracted: {len(self.extracted_images)}")
+             if self.image_dir:
+                 print(f" Image directory: {self.image_dir}")
+         if self.extract_tables:
+             print(f" Tables found: {total_tables}")
+         print(f" Chunks created: {len(chunks)}")
+         print(f" Chapters detected: {len(chapters)}")
+         print(f" Languages detected: {', '.join(languages.keys())}")
+
+         # Print quality statistics (NEW in B1.4)
+         if quality_stats:
+             print("\n📊 Code Quality Statistics:")
+             print(f" Average quality: {quality_stats['average_quality']:.1f}/10")
+             print(f" Average confidence: {quality_stats['average_confidence']:.1%}")
+             print(
+                 f" Valid code blocks: {quality_stats['valid_code_blocks']}/{total_code_blocks} ({quality_stats['validation_rate']:.1%})"
+             )
+             print(f" High quality (7+): {quality_stats['high_quality_blocks']}")
+             print(f" Medium quality (4-7): {quality_stats['medium_quality_blocks']}")
+             print(f" Low quality (<4): {quality_stats['low_quality_blocks']}")
+
+         return result
+
+
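Before the CLI entry point below, the class can also be driven directly; a sketch using the module path from the file list above (`docs/manual.pdf` is illustrative):

```python
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor

extractor = PDFExtractor("docs/manual.pdf", chunk_size=15, min_quality=6.0)
result = extractor.extract_all()
if result is not None:
    print(result["total_code_blocks"], "code blocks across", result["total_pages"], "pages")
```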
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Extract text and code blocks from PDF documentation",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+     # Extract from PDF
+     python3 pdf_extractor_poc.py input.pdf
+
+     # Save to JSON file
+     python3 pdf_extractor_poc.py input.pdf --output result.json
+
+     # Verbose mode
+     python3 pdf_extractor_poc.py input.pdf --verbose
+
+     # Extract and save
+     python3 pdf_extractor_poc.py docs/python.pdf -o python_extracted.json -v
+ """,
+     )
+
+     parser.add_argument("pdf_file", help="Path to PDF file to extract")
+     parser.add_argument("-o", "--output", help="Output JSON file path (default: print to stdout)")
+     parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
+     parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
+     parser.add_argument(
+         "--chunk-size", type=int, default=10, help="Pages per chunk (0 = no chunking, default: 10)"
+     )
+     parser.add_argument(
+         "--no-merge", action="store_true", help="Disable merging code blocks across pages"
+     )
+     parser.add_argument(
+         "--min-quality",
+         type=float,
+         default=0.0,
+         help="Minimum code quality score (0-10, default: 0 = no filtering)",
+     )
+     parser.add_argument(
+         "--extract-images", action="store_true", help="Extract images to files (NEW in B1.5)"
+     )
+     parser.add_argument(
+         "--image-dir",
+         type=str,
+         default=None,
+         help="Directory to save extracted images (default: output/{pdf_name}_images)",
+     )
+     parser.add_argument(
+         "--min-image-size",
+         type=int,
+         default=100,
+         help="Minimum image dimension in pixels (filters icons, default: 100)",
+     )
+
+     # Advanced features (Priority 2 & 3)
+     parser.add_argument(
+         "--ocr", action="store_true", help="Use OCR for scanned PDFs (requires pytesseract)"
+     )
+     parser.add_argument("--password", type=str, default=None, help="Password for encrypted PDF")
+     parser.add_argument(
+         "--extract-tables", action="store_true", help="Extract tables from PDF (Priority 2)"
+     )
+     parser.add_argument(
+         "--parallel", action="store_true", help="Process pages in parallel (Priority 3)"
+     )
+     parser.add_argument(
+         "--workers", type=int, default=None, help="Number of parallel workers (default: CPU count)"
+     )
+     parser.add_argument(
+         "--no-cache", action="store_true", help="Disable caching of expensive operations"
+     )
+
+     args = parser.parse_args()
+
+     # Validate input file
+     if not os.path.exists(args.pdf_file):
+         print(f"❌ Error: File not found: {args.pdf_file}")
+         sys.exit(1)
+
+     if not args.pdf_file.lower().endswith(".pdf"):
+         print("⚠️ Warning: File does not have .pdf extension")
+
+     # Extract
+     extractor = PDFExtractor(
+         args.pdf_file,
+         verbose=args.verbose,
+         chunk_size=args.chunk_size,
+         min_quality=args.min_quality,
+         extract_images=args.extract_images,
+         image_dir=args.image_dir,
+         min_image_size=args.min_image_size,
+         # Advanced features (Priority 2 & 3)
+         use_ocr=args.ocr,
+         password=args.password,
+         extract_tables=args.extract_tables,
+         parallel=args.parallel,
+         max_workers=args.workers,
+         use_cache=not args.no_cache,
+     )
+     result = extractor.extract_all()
+
+     if result is None:
+         sys.exit(1)
+
+     # Output
+     if args.output:
+         # Save to file
+         with open(args.output, "w", encoding="utf-8") as f:
+             if args.pretty:
+                 json.dump(result, f, indent=2, ensure_ascii=False)
+             else:
+                 json.dump(result, f, ensure_ascii=False)
+         print(f"\n💾 Saved to: {args.output}")
+     else:
+         # Print to stdout
+         if args.pretty:
+             print("\n" + json.dumps(result, indent=2, ensure_ascii=False))
+         else:
+             print(json.dumps(result, ensure_ascii=False))
+
+
+ if __name__ == "__main__":
+     main()