@techwavedev/agi-agent-kit 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -0
- package/README.md +147 -0
- package/bin/init.js +471 -0
- package/package.json +36 -0
- package/templates/.agent/agents/backend-specialist.md +263 -0
- package/templates/.agent/agents/code-archaeologist.md +106 -0
- package/templates/.agent/agents/database-architect.md +226 -0
- package/templates/.agent/agents/debugger.md +225 -0
- package/templates/.agent/agents/devops-engineer.md +242 -0
- package/templates/.agent/agents/documentation-writer.md +104 -0
- package/templates/.agent/agents/explorer-agent.md +73 -0
- package/templates/.agent/agents/frontend-specialist.md +556 -0
- package/templates/.agent/agents/game-developer.md +162 -0
- package/templates/.agent/agents/mobile-developer.md +377 -0
- package/templates/.agent/agents/orchestrator.md +416 -0
- package/templates/.agent/agents/penetration-tester.md +188 -0
- package/templates/.agent/agents/performance-optimizer.md +187 -0
- package/templates/.agent/agents/product-manager.md +112 -0
- package/templates/.agent/agents/project-planner.md +403 -0
- package/templates/.agent/agents/qa-automation-engineer.md +109 -0
- package/templates/.agent/agents/security-auditor.md +170 -0
- package/templates/.agent/agents/seo-specialist.md +111 -0
- package/templates/.agent/agents/test-engineer.md +158 -0
- package/templates/.agent/rules/GEMINI.md +253 -0
- package/templates/.agent/workflows/brainstorm.md +113 -0
- package/templates/.agent/workflows/create.md +59 -0
- package/templates/.agent/workflows/debug.md +103 -0
- package/templates/.agent/workflows/deploy.md +176 -0
- package/templates/.agent/workflows/enhance.md +63 -0
- package/templates/.agent/workflows/orchestrate.md +237 -0
- package/templates/.agent/workflows/plan.md +89 -0
- package/templates/.agent/workflows/preview.md +81 -0
- package/templates/.agent/workflows/status.md +86 -0
- package/templates/.agent/workflows/test.md +144 -0
- package/templates/.agent/workflows/ui-ux-pro-max.md +296 -0
- package/templates/base/.env.example +54 -0
- package/templates/base/AGENTS.md +463 -0
- package/templates/base/requirements.txt +6 -0
- package/templates/base/skill-creator/LICENSE.txt +202 -0
- package/templates/base/skill-creator/SKILL_skillcreator.md +389 -0
- package/templates/base/skill-creator/references/output-patterns.md +82 -0
- package/templates/base/skill-creator/references/workflows.md +28 -0
- package/templates/base/skill-creator/scripts/init_skill.py +304 -0
- package/templates/base/skill-creator/scripts/package_skill.py +110 -0
- package/templates/base/skill-creator/scripts/quick_validate.py +95 -0
- package/templates/base/skill-creator/scripts/update_catalog.py +371 -0
- package/templates/skills/core/README.md +21 -0
- package/templates/skills/core/documentation/SKILL.md +351 -0
- package/templates/skills/core/documentation/references/best_practices.md +201 -0
- package/templates/skills/core/documentation/scripts/analyze_code.py +307 -0
- package/templates/skills/core/documentation/scripts/detect_changes.py +460 -0
- package/templates/skills/core/documentation/scripts/generate_changelog.py +312 -0
- package/templates/skills/core/documentation/scripts/sync_docs.py +272 -0
- package/templates/skills/core/documentation/scripts/update_skill_docs.py +366 -0
- package/templates/skills/core/pdf-reader/SKILL.md +104 -0
- package/templates/skills/core/pdf-reader/references/pdf_libraries.md +83 -0
- package/templates/skills/core/pdf-reader/scripts/extract_text.py +295 -0
- package/templates/skills/core/qdrant-memory/SKILL.md +435 -0
- package/templates/skills/core/qdrant-memory/references/advanced_patterns.md +375 -0
- package/templates/skills/core/qdrant-memory/references/collection_schemas.md +229 -0
- package/templates/skills/core/qdrant-memory/references/complete_guide.md +724 -0
- package/templates/skills/core/qdrant-memory/references/embedding_models.md +325 -0
- package/templates/skills/core/qdrant-memory/scripts/benchmark_token_savings.py +640 -0
- package/templates/skills/core/qdrant-memory/scripts/embedding_utils.py +323 -0
- package/templates/skills/core/qdrant-memory/scripts/hybrid_search.py +214 -0
- package/templates/skills/core/qdrant-memory/scripts/init_collection.py +193 -0
- package/templates/skills/core/qdrant-memory/scripts/memory_retrieval.py +345 -0
- package/templates/skills/core/qdrant-memory/scripts/semantic_cache.py +282 -0
- package/templates/skills/core/qdrant-memory/scripts/test_skill.py +655 -0
- package/templates/skills/core/webcrawler/SKILL.md +292 -0
- package/templates/skills/core/webcrawler/references/advanced_crawling.md +181 -0
- package/templates/skills/core/webcrawler/scripts/crawl_docs.py +532 -0
- package/templates/skills/core/webcrawler/scripts/extract_page.py +189 -0
- package/templates/skills/core/webcrawler/scripts/filter_docs.py +200 -0
- package/templates/skills/knowledge/api-patterns/SKILL.md +81 -0
- package/templates/skills/knowledge/api-patterns/api-style.md +42 -0
- package/templates/skills/knowledge/api-patterns/auth.md +24 -0
- package/templates/skills/knowledge/api-patterns/documentation.md +26 -0
- package/templates/skills/knowledge/api-patterns/graphql.md +41 -0
- package/templates/skills/knowledge/api-patterns/rate-limiting.md +31 -0
- package/templates/skills/knowledge/api-patterns/response.md +37 -0
- package/templates/skills/knowledge/api-patterns/rest.md +40 -0
- package/templates/skills/knowledge/api-patterns/scripts/api_validator.py +211 -0
- package/templates/skills/knowledge/api-patterns/security-testing.md +122 -0
- package/templates/skills/knowledge/api-patterns/trpc.md +41 -0
- package/templates/skills/knowledge/api-patterns/versioning.md +22 -0
- package/templates/skills/knowledge/app-builder/SKILL.md +75 -0
- package/templates/skills/knowledge/app-builder/agent-coordination.md +71 -0
- package/templates/skills/knowledge/app-builder/feature-building.md +53 -0
- package/templates/skills/knowledge/app-builder/project-detection.md +34 -0
- package/templates/skills/knowledge/app-builder/scaffolding.md +118 -0
- package/templates/skills/knowledge/app-builder/tech-stack.md +40 -0
- package/templates/skills/knowledge/app-builder/templates/SKILL.md +39 -0
- package/templates/skills/knowledge/app-builder/templates/astro-static/TEMPLATE.md +76 -0
- package/templates/skills/knowledge/app-builder/templates/chrome-extension/TEMPLATE.md +92 -0
- package/templates/skills/knowledge/app-builder/templates/cli-tool/TEMPLATE.md +88 -0
- package/templates/skills/knowledge/app-builder/templates/electron-desktop/TEMPLATE.md +88 -0
- package/templates/skills/knowledge/app-builder/templates/express-api/TEMPLATE.md +83 -0
- package/templates/skills/knowledge/app-builder/templates/flutter-app/TEMPLATE.md +90 -0
- package/templates/skills/knowledge/app-builder/templates/monorepo-turborepo/TEMPLATE.md +90 -0
- package/templates/skills/knowledge/app-builder/templates/nextjs-fullstack/TEMPLATE.md +82 -0
- package/templates/skills/knowledge/app-builder/templates/nextjs-saas/TEMPLATE.md +100 -0
- package/templates/skills/knowledge/app-builder/templates/nextjs-static/TEMPLATE.md +106 -0
- package/templates/skills/knowledge/app-builder/templates/nuxt-app/TEMPLATE.md +101 -0
- package/templates/skills/knowledge/app-builder/templates/python-fastapi/TEMPLATE.md +83 -0
- package/templates/skills/knowledge/app-builder/templates/react-native-app/TEMPLATE.md +93 -0
- package/templates/skills/knowledge/architecture/SKILL.md +55 -0
- package/templates/skills/knowledge/architecture/context-discovery.md +43 -0
- package/templates/skills/knowledge/architecture/examples.md +94 -0
- package/templates/skills/knowledge/architecture/pattern-selection.md +68 -0
- package/templates/skills/knowledge/architecture/patterns-reference.md +50 -0
- package/templates/skills/knowledge/architecture/trade-off-analysis.md +77 -0
- package/templates/skills/knowledge/bash-linux/SKILL.md +199 -0
- package/templates/skills/knowledge/behavioral-modes/SKILL.md +242 -0
- package/templates/skills/knowledge/brainstorming/SKILL.md +163 -0
- package/templates/skills/knowledge/brainstorming/dynamic-questioning.md +350 -0
- package/templates/skills/knowledge/clean-code/SKILL.md +201 -0
- package/templates/skills/knowledge/code-review-checklist/SKILL.md +109 -0
- package/templates/skills/knowledge/database-design/SKILL.md +52 -0
- package/templates/skills/knowledge/database-design/database-selection.md +43 -0
- package/templates/skills/knowledge/database-design/indexing.md +39 -0
- package/templates/skills/knowledge/database-design/migrations.md +48 -0
- package/templates/skills/knowledge/database-design/optimization.md +36 -0
- package/templates/skills/knowledge/database-design/orm-selection.md +30 -0
- package/templates/skills/knowledge/database-design/schema-design.md +56 -0
- package/templates/skills/knowledge/database-design/scripts/schema_validator.py +172 -0
- package/templates/skills/knowledge/deployment-procedures/SKILL.md +241 -0
- package/templates/skills/knowledge/doc.md +177 -0
- package/templates/skills/knowledge/documentation-templates/SKILL.md +194 -0
- package/templates/skills/knowledge/frontend-design/SKILL.md +396 -0
- package/templates/skills/knowledge/frontend-design/animation-guide.md +331 -0
- package/templates/skills/knowledge/frontend-design/color-system.md +311 -0
- package/templates/skills/knowledge/frontend-design/decision-trees.md +418 -0
- package/templates/skills/knowledge/frontend-design/motion-graphics.md +306 -0
- package/templates/skills/knowledge/frontend-design/scripts/accessibility_checker.py +183 -0
- package/templates/skills/knowledge/frontend-design/scripts/ux_audit.py +722 -0
- package/templates/skills/knowledge/frontend-design/typography-system.md +345 -0
- package/templates/skills/knowledge/frontend-design/ux-psychology.md +541 -0
- package/templates/skills/knowledge/frontend-design/visual-effects.md +383 -0
- package/templates/skills/knowledge/game-development/2d-games/SKILL.md +119 -0
- package/templates/skills/knowledge/game-development/3d-games/SKILL.md +135 -0
- package/templates/skills/knowledge/game-development/SKILL.md +167 -0
- package/templates/skills/knowledge/game-development/game-art/SKILL.md +185 -0
- package/templates/skills/knowledge/game-development/game-audio/SKILL.md +190 -0
- package/templates/skills/knowledge/game-development/game-design/SKILL.md +129 -0
- package/templates/skills/knowledge/game-development/mobile-games/SKILL.md +108 -0
- package/templates/skills/knowledge/game-development/multiplayer/SKILL.md +132 -0
- package/templates/skills/knowledge/game-development/pc-games/SKILL.md +144 -0
- package/templates/skills/knowledge/game-development/vr-ar/SKILL.md +123 -0
- package/templates/skills/knowledge/game-development/web-games/SKILL.md +150 -0
- package/templates/skills/knowledge/geo-fundamentals/SKILL.md +156 -0
- package/templates/skills/knowledge/geo-fundamentals/scripts/geo_checker.py +289 -0
- package/templates/skills/knowledge/i18n-localization/SKILL.md +154 -0
- package/templates/skills/knowledge/i18n-localization/scripts/i18n_checker.py +241 -0
- package/templates/skills/knowledge/intelligent-routing/SKILL.md +334 -0
- package/templates/skills/knowledge/lint-and-validate/SKILL.md +45 -0
- package/templates/skills/knowledge/lint-and-validate/scripts/lint_runner.py +172 -0
- package/templates/skills/knowledge/lint-and-validate/scripts/type_coverage.py +173 -0
- package/templates/skills/knowledge/mcp-builder/SKILL.md +176 -0
- package/templates/skills/knowledge/mobile-design/SKILL.md +394 -0
- package/templates/skills/knowledge/mobile-design/decision-trees.md +516 -0
- package/templates/skills/knowledge/mobile-design/mobile-backend.md +491 -0
- package/templates/skills/knowledge/mobile-design/mobile-color-system.md +420 -0
- package/templates/skills/knowledge/mobile-design/mobile-debugging.md +122 -0
- package/templates/skills/knowledge/mobile-design/mobile-design-thinking.md +357 -0
- package/templates/skills/knowledge/mobile-design/mobile-navigation.md +458 -0
- package/templates/skills/knowledge/mobile-design/mobile-performance.md +767 -0
- package/templates/skills/knowledge/mobile-design/mobile-testing.md +356 -0
- package/templates/skills/knowledge/mobile-design/mobile-typography.md +433 -0
- package/templates/skills/knowledge/mobile-design/platform-android.md +666 -0
- package/templates/skills/knowledge/mobile-design/platform-ios.md +561 -0
- package/templates/skills/knowledge/mobile-design/scripts/mobile_audit.py +670 -0
- package/templates/skills/knowledge/mobile-design/touch-psychology.md +537 -0
- package/templates/skills/knowledge/nextjs-best-practices/SKILL.md +203 -0
- package/templates/skills/knowledge/nodejs-best-practices/SKILL.md +333 -0
- package/templates/skills/knowledge/parallel-agents/SKILL.md +175 -0
- package/templates/skills/knowledge/performance-profiling/SKILL.md +143 -0
- package/templates/skills/knowledge/performance-profiling/scripts/lighthouse_audit.py +76 -0
- package/templates/skills/knowledge/plan-writing/SKILL.md +152 -0
- package/templates/skills/knowledge/powershell-windows/SKILL.md +167 -0
- package/templates/skills/knowledge/python-patterns/SKILL.md +441 -0
- package/templates/skills/knowledge/react-patterns/SKILL.md +198 -0
- package/templates/skills/knowledge/red-team-tactics/SKILL.md +199 -0
- package/templates/skills/knowledge/seo-fundamentals/SKILL.md +129 -0
- package/templates/skills/knowledge/seo-fundamentals/scripts/seo_checker.py +219 -0
- package/templates/skills/knowledge/server-management/SKILL.md +161 -0
- package/templates/skills/knowledge/systematic-debugging/SKILL.md +109 -0
- package/templates/skills/knowledge/tailwind-patterns/SKILL.md +269 -0
- package/templates/skills/knowledge/tdd-workflow/SKILL.md +149 -0
- package/templates/skills/knowledge/testing-patterns/SKILL.md +178 -0
- package/templates/skills/knowledge/testing-patterns/scripts/test_runner.py +219 -0
- package/templates/skills/knowledge/vulnerability-scanner/SKILL.md +276 -0
- package/templates/skills/knowledge/vulnerability-scanner/checklists.md +121 -0
- package/templates/skills/knowledge/vulnerability-scanner/scripts/security_scan.py +458 -0
- package/templates/skills/knowledge/webapp-testing/SKILL.md +187 -0
- package/templates/skills/knowledge/webapp-testing/scripts/playwright_runner.py +173 -0
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Script: extract_text.py
|
|
4
|
+
Purpose: Extract text from PDF files with multiple fallback methods
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
python extract_text.py <pdf_path> [--output <file>] [--method <method>] [--pages <range>]
|
|
8
|
+
|
|
9
|
+
Arguments:
|
|
10
|
+
pdf_path Path to PDF file (required)
|
|
11
|
+
--output, -o Output file path (default: stdout)
|
|
12
|
+
--method, -m Extraction method: auto|pdfplumber|pymupdf|pdfminer (default: auto)
|
|
13
|
+
--pages, -p Page range, e.g., "1-5" or "1,3,5" (default: all)
|
|
14
|
+
--preserve-layout Keep spatial layout (default: false)
|
|
15
|
+
--json Output as JSON with metadata
|
|
16
|
+
|
|
17
|
+
Exit Codes: 0=success, 1=args, 2=file not found, 3=library error, 4=extraction error
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import json
|
|
22
|
+
import sys
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def check_dependencies():
    """Probe for the supported PDF backends and report those that import.

    Returns:
        dict: Maps a backend name to the object imported for it. The key
        'pdfplumber' holds the pdfplumber module, 'pymupdf' holds the fitz
        module, and 'pdfminer' holds pdfminer.high_level.extract_text.
        Backends whose import raises ImportError are simply left out, so an
        empty dict means no backend is installed.
    """
    found = {}

    try:
        import pdfplumber
    except ImportError:
        pass
    else:
        found['pdfplumber'] = pdfplumber

    try:
        import fitz  # PyMuPDF is imported under the name "fitz"
    except ImportError:
        pass
    else:
        found['pymupdf'] = fitz

    try:
        from pdfminer.high_level import extract_text as pdfminer_extract
    except ImportError:
        pass
    else:
        found['pdfminer'] = pdfminer_extract

    return found
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def parse_page_range(page_str: str, total_pages: int) -> list[int]:
    """Parse a page range string into a sorted list of 0-indexed page numbers.

    The string is a comma-separated list of 1-based page numbers ("3") and
    inclusive ranges ("1-5"). Either bound of a range may be omitted: "-5"
    starts at the first page and "8-" runs to the last page.

    Args:
        page_str: The range expression. An empty string or None selects
            every page.
        total_pages: Number of pages in the document; selections are
            clamped to ``0 .. total_pages - 1``.

    Returns:
        Sorted, de-duplicated 0-indexed page numbers. Pages outside the
        document are dropped, so the list may be empty.

    Raises:
        ValueError: If a page number or range bound is not an integer.
    """
    if not page_str:
        return list(range(total_pages))

    pages = set()
    for part in page_str.split(','):
        part = part.strip()
        if not part:
            # Tolerate stray separators such as "1,3," instead of crashing
            # on int('').
            continue

        if '-' in part:
            start, end = (bound.strip() for bound in part.split('-', 1))
            first = int(start) - 1 if start else 0
            last = int(end) if end else total_pages
            # Clamp the lower bound: "0-3" would otherwise produce index -1,
            # which sequence indexing interprets as the last page.
            pages.update(range(max(first, 0), min(last, total_pages)))
        else:
            page = int(part) - 1
            if 0 <= page < total_pages:
                pages.add(page)

    return sorted(pages)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def extract_with_pdfplumber(pdf_path: Path, pages: list[int] = None, preserve_layout: bool = False) -> dict:
    """Extract text from a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF to read.
        pages: 0-indexed page numbers to extract. ``None`` extracts every
            page; an empty list extracts nothing. Indices outside the
            document are skipped.
        preserve_layout: When true, pages are extracted with
            ``extract_text(layout=True)``.

    Returns:
        A dict with two keys. "pages" is a list of per-page dicts holding
        "page_num" (1-based), "text", "width" and "height". "metadata" holds
        "total_pages", "method" and any non-empty entries of the document's
        own metadata.
    """
    import pdfplumber

    result = {"pages": [], "metadata": {}}

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        result["metadata"] = {
            "total_pages": total_pages,
            "method": "pdfplumber"
        }

        if pdf.metadata:
            result["metadata"].update({k: v for k, v in pdf.metadata.items() if v})

        # Compare against None rather than relying on truthiness: an empty
        # selection must yield no pages, not silently fall back to all pages.
        page_indices = range(total_pages) if pages is None else pages

        for i in page_indices:
            # Skip indices outside the document; a negative index would
            # otherwise wrap around to a page counted from the end.
            if not 0 <= i < total_pages:
                continue

            page = pdf.pages[i]
            if preserve_layout:
                text = page.extract_text(layout=True) or ""
            else:
                text = page.extract_text() or ""

            result["pages"].append({
                "page_num": i + 1,
                "text": text,
                "width": page.width,
                "height": page.height
            })

    return result
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def extract_with_pymupdf(pdf_path: Path, pages: list[int] = None, preserve_layout: bool = False) -> dict:
    """Extract text from a PDF using PyMuPDF (imported as ``fitz``).

    Args:
        pdf_path: Path of the PDF to read.
        pages: 0-indexed page numbers to extract. ``None`` extracts every
            page; an empty list extracts nothing. Indices outside the
            document are skipped.
        preserve_layout: When true, text is extracted with the
            ``TEXT_PRESERVE_WHITESPACE`` flag; otherwise with ``flags=0``.

    Returns:
        A dict with two keys. "pages" is a list of per-page dicts holding
        "page_num" (1-based), "text", "width" and "height". "metadata" holds
        "total_pages", "method" and any non-empty entries of the document's
        own metadata.
    """
    import fitz

    result = {"pages": [], "metadata": {}}

    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
        result["metadata"] = {
            "total_pages": total_pages,
            "method": "pymupdf"
        }

        if doc.metadata:
            result["metadata"].update({k: v for k, v in doc.metadata.items() if v})

        # Compare against None rather than relying on truthiness: an empty
        # selection must yield no pages, not silently fall back to all pages.
        page_indices = range(total_pages) if pages is None else pages

        # NOTE(review): flags=0 replaces whatever flag set get_text() would
        # use by default - confirm that is intended for the non-layout case.
        flags = fitz.TEXT_PRESERVE_WHITESPACE if preserve_layout else 0

        for i in page_indices:
            # Skip indices outside the document; a negative index would
            # otherwise select a page counted from the end and be reported
            # with a nonsensical page_num.
            if not 0 <= i < total_pages:
                continue

            page = doc[i]
            text = page.get_text("text", flags=flags)

            result["pages"].append({
                "page_num": i + 1,
                "text": text,
                "width": page.rect.width,
                "height": page.rect.height
            })
    finally:
        # Release the document even when extraction raises part-way through.
        doc.close()

    return result
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def extract_with_pdfminer(pdf_path: Path, pages: list[int] = None, preserve_layout: bool = False) -> dict:
    """Extract text from a PDF using pdfminer.

    Args:
        pdf_path: Path of the PDF to read.
        pages: 0-indexed page numbers to extract. ``None`` extracts every
            page; an empty list extracts nothing. Indices outside the
            document are skipped.
        preserve_layout: Passed to ``LAParams`` as ``detect_vertical``.

    Returns:
        A dict with two keys. "pages" is a list of per-page dicts holding
        "page_num" (1-based), "text", "width" and "height". "metadata" holds
        "method" and "total_pages".
    """
    # Only extract_pages is needed. Importing pdfminer's extract_text here
    # would also shadow this module's own extract_text inside the function.
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LAParams

    result = {"pages": [], "metadata": {"method": "pdfminer"}}

    laparams = LAParams(
        detect_vertical=preserve_layout,
        all_texts=True
    )

    # The layout pages are materialised up front because the total page
    # count is reported in the metadata.
    all_pages = list(extract_pages(pdf_path, laparams=laparams))
    total_pages = len(all_pages)
    result["metadata"]["total_pages"] = total_pages

    # Compare against None rather than relying on truthiness: an empty
    # selection must yield no pages, not silently fall back to all pages.
    page_indices = range(total_pages) if pages is None else pages

    for i in page_indices:
        # Skip indices outside the document, including negative ones, which
        # list indexing would otherwise resolve from the end.
        if not 0 <= i < total_pages:
            continue

        page = all_pages[i]
        # Only layout elements exposing get_text() contribute text.
        text_content = [
            element.get_text()
            for element in page
            if hasattr(element, 'get_text')
        ]

        result["pages"].append({
            "page_num": i + 1,
            "text": "".join(text_content),
            "width": page.width,
            "height": page.height
        })

    return result
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def extract_text(pdf_path: Path, method: str = "auto", pages: list[int] = None,
                 preserve_layout: bool = False) -> dict:
    """Run one of the backend extractors on a PDF and return its result.

    Args:
        pdf_path: Path of the PDF to read.
        method: "auto", "pdfplumber", "pymupdf" or "pdfminer". With "auto"
            the first installed backend in the order pdfplumber, pymupdf,
            pdfminer is used.
        pages: Page selection forwarded unchanged to the extractor.
        preserve_layout: Layout flag forwarded unchanged to the extractor.

    Returns:
        The dict produced by the selected ``extract_with_*`` function.

    Exits:
        Prints a JSON error to stderr and calls ``sys.exit(3)`` when no
        backend is installed or the requested one is missing.
    """
    backends = check_dependencies()

    if not backends:
        no_library = {
            "status": "error",
            "message": "No PDF library available. Install: pip install pdfplumber pymupdf pdfminer.six"
        }
        print(json.dumps(no_library), file=sys.stderr)
        sys.exit(3)

    if method == "auto":
        # First installed backend wins; pdfminer is the last resort.
        method = next(
            (name for name in ('pdfplumber', 'pymupdf') if name in backends),
            'pdfminer',
        )

    if method not in backends:
        missing = {
            "status": "error",
            "message": f"Method '{method}' not available. Install the corresponding library.",
            "available": list(backends.keys())
        }
        print(json.dumps(missing), file=sys.stderr)
        sys.exit(3)

    dispatch = {
        'pdfplumber': extract_with_pdfplumber,
        'pymupdf': extract_with_pymupdf,
        'pdfminer': extract_with_pdfminer
    }
    extractor = dispatch[method]
    return extractor(pdf_path, pages, preserve_layout)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def main():
    """Command-line entry point: parse arguments, extract text, emit output.

    Exit codes set here: 0 on success, 1 for an invalid --pages value, 2
    when the PDF path does not exist, 3 when no PDF library is installed,
    and 4 for any other exception raised while extracting or writing.
    argparse itself exits with status 2 on usage errors.
    """
    parser = argparse.ArgumentParser(
        description="Extract text from PDF files",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('pdf_path', help='Path to PDF file')
    parser.add_argument('-o', '--output', help='Output file path')
    parser.add_argument('-m', '--method', default='auto',
                        choices=['auto', 'pdfplumber', 'pymupdf', 'pdfminer'],
                        help='Extraction method')
    parser.add_argument('-p', '--pages', help='Page range (e.g., "1-5" or "1,3,5")')
    parser.add_argument('--preserve-layout', action='store_true',
                        help='Preserve spatial layout')
    parser.add_argument('--json', action='store_true', dest='output_json',
                        help='Output as JSON with metadata')

    args = parser.parse_args()

    pdf_path = Path(args.pdf_path)
    if not pdf_path.exists():
        print(json.dumps({"status": "error", "message": f"File not found: {pdf_path}"}),
              file=sys.stderr)
        sys.exit(2)

    try:
        available = check_dependencies()
        if not available:
            # Report the reason instead of exiting silently.
            print(json.dumps({
                "status": "error",
                "message": "No PDF library available. Install: pip install pdfplumber pymupdf pdfminer.six"
            }), file=sys.stderr)
            sys.exit(3)

        pages = None
        if args.pages:
            # The page count is only needed to clamp an explicit range, so
            # the PDF is not opened an extra time when --pages is absent.
            if 'pymupdf' in available:
                import fitz
                doc = fitz.open(pdf_path)
                try:
                    total_pages = len(doc)
                finally:
                    doc.close()
            elif 'pdfplumber' in available:
                import pdfplumber
                with pdfplumber.open(pdf_path) as pdf:
                    total_pages = len(pdf.pages)
            else:
                total_pages = 9999  # Will be limited during extraction

            try:
                pages = parse_page_range(args.pages, total_pages)
            except ValueError:
                # A malformed range is an argument error, not an extraction
                # failure, so it gets the args exit code.
                print(json.dumps({
                    "status": "error",
                    "message": f"Invalid page range: {args.pages!r}"
                }), file=sys.stderr)
                sys.exit(1)

        result = extract_text(
            pdf_path,
            method=args.method,
            pages=pages,
            preserve_layout=args.preserve_layout
        )

        # Format output
        if args.output_json:
            output = json.dumps({"status": "success", **result}, indent=2)
        else:
            # Plain text output: one "--- Page N ---" section per page.
            output = "\n\n".join(
                f"--- Page {p['page_num']} ---\n{p['text']}"
                for p in result['pages']
            )

        if args.output:
            out_path = Path(args.output)
            out_path.parent.mkdir(parents=True, exist_ok=True)
            # Explicit UTF-8: extracted text is often non-ASCII and the
            # platform default encoding may be unable to represent it.
            out_path.write_text(output, encoding="utf-8")
            print(json.dumps({"status": "success", "output": args.output,
                              "pages_extracted": len(result['pages'])}))
        else:
            print(output)

    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the explicit exits above pass through this handler untouched.
        print(json.dumps({"status": "error", "message": str(e), "type": type(e).__name__}),
              file=sys.stderr)
        sys.exit(4)

    sys.exit(0)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|