@brookmind/ai-toolkit 1.1.7 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +42 -14
- package/dist/__tests__/constants.test.d.ts +2 -0
- package/dist/__tests__/constants.test.d.ts.map +1 -0
- package/dist/__tests__/constants.test.js +102 -0
- package/dist/__tests__/constants.test.js.map +1 -0
- package/dist/__tests__/index.test.d.ts +2 -0
- package/dist/__tests__/index.test.d.ts.map +1 -0
- package/dist/__tests__/index.test.js +114 -0
- package/dist/__tests__/index.test.js.map +1 -0
- package/dist/__tests__/integration/installer.test.d.ts +2 -0
- package/dist/__tests__/integration/installer.test.d.ts.map +1 -0
- package/dist/__tests__/integration/installer.test.js +425 -0
- package/dist/__tests__/integration/installer.test.js.map +1 -0
- package/dist/__tests__/services/installers.test.d.ts +2 -0
- package/dist/__tests__/services/installers.test.d.ts.map +1 -0
- package/dist/__tests__/services/installers.test.js +222 -0
- package/dist/__tests__/services/installers.test.js.map +1 -0
- package/dist/__tests__/services/opencode.test.d.ts +2 -0
- package/dist/__tests__/services/opencode.test.d.ts.map +1 -0
- package/dist/__tests__/services/opencode.test.js +120 -0
- package/dist/__tests__/services/opencode.test.js.map +1 -0
- package/dist/__tests__/ui/categorize.test.d.ts +2 -0
- package/dist/__tests__/ui/categorize.test.d.ts.map +1 -0
- package/dist/__tests__/ui/categorize.test.js +194 -0
- package/dist/__tests__/ui/categorize.test.js.map +1 -0
- package/dist/__tests__/ui/choices.test.d.ts +2 -0
- package/dist/__tests__/ui/choices.test.d.ts.map +1 -0
- package/dist/__tests__/ui/choices.test.js +180 -0
- package/dist/__tests__/ui/choices.test.js.map +1 -0
- package/dist/__tests__/ui/display.test.d.ts +2 -0
- package/dist/__tests__/ui/display.test.d.ts.map +1 -0
- package/dist/__tests__/ui/display.test.js +142 -0
- package/dist/__tests__/ui/display.test.js.map +1 -0
- package/dist/__tests__/utils/fs.test.d.ts +2 -0
- package/dist/__tests__/utils/fs.test.d.ts.map +1 -0
- package/dist/__tests__/utils/fs.test.js +142 -0
- package/dist/__tests__/utils/fs.test.js.map +1 -0
- package/dist/__tests__/utils/terminal.test.d.ts +2 -0
- package/dist/__tests__/utils/terminal.test.d.ts.map +1 -0
- package/dist/__tests__/utils/terminal.test.js +97 -0
- package/dist/__tests__/utils/terminal.test.js.map +1 -0
- package/dist/constants.d.ts +11 -0
- package/dist/constants.d.ts.map +1 -0
- package/dist/constants.js +40 -0
- package/dist/constants.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +49 -332
- package/dist/index.js.map +1 -1
- package/dist/services/installers.d.ts +8 -0
- package/dist/services/installers.d.ts.map +1 -0
- package/dist/services/installers.js +79 -0
- package/dist/services/installers.js.map +1 -0
- package/dist/services/opencode.d.ts +3 -0
- package/dist/services/opencode.d.ts.map +1 -0
- package/dist/services/opencode.js +33 -0
- package/dist/services/opencode.js.map +1 -0
- package/dist/types.d.ts +10 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/dist/ui/categorize.d.ts +6 -0
- package/dist/ui/categorize.d.ts.map +1 -0
- package/dist/ui/categorize.js +69 -0
- package/dist/ui/categorize.js.map +1 -0
- package/dist/ui/choices.d.ts +6 -0
- package/dist/ui/choices.d.ts.map +1 -0
- package/dist/ui/choices.js +70 -0
- package/dist/ui/choices.js.map +1 -0
- package/dist/ui/display.d.ts +8 -0
- package/dist/ui/display.d.ts.map +1 -0
- package/dist/ui/display.js +86 -0
- package/dist/ui/display.js.map +1 -0
- package/dist/utils/fs.d.ts +5 -0
- package/dist/utils/fs.d.ts.map +1 -0
- package/dist/utils/fs.js +40 -0
- package/dist/utils/fs.js.map +1 -0
- package/dist/utils/terminal.d.ts +5 -0
- package/dist/utils/terminal.d.ts.map +1 -0
- package/dist/utils/terminal.js +18 -0
- package/dist/utils/terminal.js.map +1 -0
- package/package.json +29 -5
- package/agents/code-reviewer.md +0 -35
- package/agents/code-simplifier.md +0 -52
- package/commands/create-pr-description.md +0 -102
- package/commands/create-pr.md +0 -76
- package/commands/create-react-tests.md +0 -207
- package/mcps/context7/.mcp.json +0 -13
- package/mcps/expo-mcp/.mcp.json +0 -13
- package/mcps/figma-mcp/.mcp.json +0 -10
- package/skills/github-cli/SKILL.md +0 -125
- package/skills/pdf-processing-pro/FORMS.md +0 -610
- package/skills/pdf-processing-pro/OCR.md +0 -137
- package/skills/pdf-processing-pro/SKILL.md +0 -296
- package/skills/pdf-processing-pro/TABLES.md +0 -626
- package/skills/pdf-processing-pro/scripts/analyze_form.py +0 -307
- package/skills/react-best-practices/AGENTS.md +0 -915
- package/skills/react-best-practices/README.md +0 -127
- package/skills/react-best-practices/SKILL.md +0 -110
- package/skills/react-best-practices/metadata.json +0 -14
- package/skills/react-best-practices/rules/_sections.md +0 -41
- package/skills/react-best-practices/rules/_template.md +0 -28
- package/skills/react-best-practices/rules/advanced-event-handler-refs.md +0 -80
- package/skills/react-best-practices/rules/advanced-use-latest.md +0 -76
- package/skills/react-best-practices/rules/async-defer-await.md +0 -80
- package/skills/react-best-practices/rules/async-dependencies.md +0 -36
- package/skills/react-best-practices/rules/async-parallel.md +0 -28
- package/skills/react-best-practices/rules/async-suspense-boundaries.md +0 -100
- package/skills/react-best-practices/rules/bundle-barrel-imports.md +0 -42
- package/skills/react-best-practices/rules/bundle-conditional.md +0 -106
- package/skills/react-best-practices/rules/bundle-preload.md +0 -44
- package/skills/react-best-practices/rules/client-event-listeners.md +0 -131
- package/skills/react-best-practices/rules/client-swr-dedup.md +0 -133
- package/skills/react-best-practices/rules/js-batch-dom-css.md +0 -82
- package/skills/react-best-practices/rules/js-cache-function-results.md +0 -80
- package/skills/react-best-practices/rules/js-cache-property-access.md +0 -28
- package/skills/react-best-practices/rules/js-cache-storage.md +0 -70
- package/skills/react-best-practices/rules/js-combine-iterations.md +0 -32
- package/skills/react-best-practices/rules/js-early-exit.md +0 -50
- package/skills/react-best-practices/rules/js-hoist-regexp.md +0 -45
- package/skills/react-best-practices/rules/js-index-maps.md +0 -37
- package/skills/react-best-practices/rules/js-length-check-first.md +0 -49
- package/skills/react-best-practices/rules/js-min-max-loop.md +0 -82
- package/skills/react-best-practices/rules/js-set-map-lookups.md +0 -24
- package/skills/react-best-practices/rules/js-tosorted-immutable.md +0 -57
- package/skills/react-best-practices/rules/rendering-activity.md +0 -90
- package/skills/react-best-practices/rules/rendering-animate-svg-wrapper.md +0 -47
- package/skills/react-best-practices/rules/rendering-conditional-render.md +0 -40
- package/skills/react-best-practices/rules/rendering-content-visibility.md +0 -38
- package/skills/react-best-practices/rules/rendering-hoist-jsx.md +0 -65
- package/skills/react-best-practices/rules/rendering-svg-precision.md +0 -28
- package/skills/react-best-practices/rules/rerender-defer-reads.md +0 -39
- package/skills/react-best-practices/rules/rerender-dependencies.md +0 -45
- package/skills/react-best-practices/rules/rerender-derived-state.md +0 -29
- package/skills/react-best-practices/rules/rerender-functional-setstate.md +0 -74
- package/skills/react-best-practices/rules/rerender-lazy-state-init.md +0 -58
- package/skills/react-best-practices/rules/rerender-memo.md +0 -85
- package/skills/react-best-practices/rules/rerender-transitions.md +0 -40
- package/skills/skill-creator/LICENSE.txt +0 -202
- package/skills/skill-creator/SKILL.md +0 -209
- package/skills/skill-creator/scripts/init_skill.py +0 -303
- package/skills/skill-creator/scripts/package_skill.py +0 -110
- package/skills/skill-creator/scripts/quick_validate.py +0 -65
- package/skills/spring-boot-development/EXAMPLES.md +0 -2346
- package/skills/spring-boot-development/README.md +0 -595
- package/skills/spring-boot-development/SKILL.md +0 -1519
- package/themes/README.md +0 -68
- package/themes/claude-vivid.json +0 -72
@@ -1,626 +0,0 @@
# PDF Table Extraction Guide

Advanced table extraction strategies for production environments.

## Table of contents

- Basic table extraction
- Multi-page tables
- Complex table structures
- Export formats
- Table detection algorithms
- Custom extraction rules
- Performance optimization
- Production examples

## Basic table extraction

### Using pdfplumber (recommended)

```python
import pdfplumber

with pdfplumber.open("report.pdf") as pdf:
    page = pdf.pages[0]
    tables = page.extract_tables()

    for i, table in enumerate(tables):
        print(f"\nTable {i + 1}:")
        for row in table:
            print(row)
```

### Using included script

```bash
python scripts/extract_tables.py report.pdf --output tables.csv
```

Output:
```csv
Name,Age,City
John Doe,30,New York
Jane Smith,25,Los Angeles
Bob Johnson,35,Chicago
```

## Table extraction strategies

### Strategy 1: Automatic detection

Let pdfplumber auto-detect tables:

```python
import pdfplumber

with pdfplumber.open("document.pdf") as pdf:
    for page_num, page in enumerate(pdf.pages, 1):
        tables = page.extract_tables()

        if tables:
            print(f"Found {len(tables)} table(s) on page {page_num}")

            for table_num, table in enumerate(tables, 1):
                print(f"\nTable {table_num}:")
                # First row is usually headers
                headers = table[0]
                print(f"Columns: {headers}")

                # Data rows
                for row in table[1:]:
                    print(row)
```

### Strategy 2: Custom table settings

Fine-tune detection with custom settings:

```python
import pdfplumber

table_settings = {
    "vertical_strategy": "lines",  # or "text", "lines_strict"
    "horizontal_strategy": "lines",
    "explicit_vertical_lines": [],
    "explicit_horizontal_lines": [],
    "snap_tolerance": 3,
    "join_tolerance": 3,
    "edge_min_length": 3,
    "min_words_vertical": 3,
    "min_words_horizontal": 1,
    "keep_blank_chars": False,
    "text_tolerance": 3,
    "text_x_tolerance": 3,
    "text_y_tolerance": 3,
    "intersection_tolerance": 3
}

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]
    tables = page.extract_tables(table_settings=table_settings)
```

### Strategy 3: Explicit boundaries

Define table boundaries manually:

```python
import pdfplumber

with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # Define bounding box (x0, top, x1, bottom)
    bbox = (50, 100, 550, 700)

    # Extract table within bounding box
    cropped = page.within_bbox(bbox)
    tables = cropped.extract_tables()
```

## Multi-page tables

### Detect and merge multi-page tables

```python
import pdfplumber

def extract_multipage_table(pdf_path, start_page=0, end_page=None):
    """Extract table that spans multiple pages."""

    all_rows = []
    headers = None

    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages[start_page:end_page]

        for page_num, page in enumerate(pages):
            tables = page.extract_tables()

            if not tables:
                continue

            # Assume first table on page
            table = tables[0]

            if page_num == 0:
                # First page: capture headers and data
                headers = table[0]
                all_rows.extend(table[1:])
            else:
                # Subsequent pages: skip headers if they repeat
                if table[0] == headers:
                    all_rows.extend(table[1:])
                else:
                    all_rows.extend(table)

    return ([headers] + all_rows) if headers else all_rows

# Usage
table = extract_multipage_table("report.pdf", start_page=2, end_page=5)

print(f"Extracted {len(table) - 1} rows")
print(f"Columns: {table[0]}")
```

## Complex table structures

### Handling merged cells

```python
import pdfplumber

def handle_merged_cells(table):
    """Process table with merged cells."""

    processed = []

    for row in table:
        new_row = []
        last_value = None

        for cell in row:
            if cell is None or cell == "":
                # Merged cell - use value from left
                new_row.append(last_value)
            else:
                new_row.append(cell)
                last_value = cell

        processed.append(new_row)

    return processed

# Usage
with pdfplumber.open("document.pdf") as pdf:
    table = pdf.pages[0].extract_tables()[0]
    clean_table = handle_merged_cells(table)
```

### Nested tables

```python
def extract_nested_tables(page, bbox):
    """Extract nested tables from a region."""

    cropped = page.within_bbox(bbox)

    # Try to detect sub-regions with tables
    tables = cropped.extract_tables()

    result = []
    for table in tables:
        # Process each nested table
        if table:
            result.append({
                "type": "nested",
                "data": table
            })

    return result
```

### Tables with varying column counts

```python
def normalize_table_columns(table):
    """Normalize table with inconsistent column counts."""

    if not table:
        return table

    # Find max column count
    max_cols = max(len(row) for row in table)

    # Pad short rows
    normalized = []
    for row in table:
        if len(row) < max_cols:
            # Pad with empty strings
            row = row + [""] * (max_cols - len(row))
        normalized.append(row)

    return normalized
```

## Export formats

### Export to CSV

```python
import csv

def export_to_csv(table, output_path):
    """Export table to CSV."""

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(table)

# Usage
table = extract_table("report.pdf")
export_to_csv(table, "output.csv")
```
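
The usage snippets in this guide call `extract_table` and `extract_all_tables` without defining them. A minimal interpretation, offered here as an assumption rather than the skill's original helpers:

```python
import pdfplumber

def extract_table(pdf_path):
    """Assumed helper: return the first table found in the PDF, or None."""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            if tables:
                return tables[0]
    return None

def extract_all_tables(pdf_path):
    """Assumed helper: return every table from every page as a flat list."""
    all_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            all_tables.extend(page.extract_tables())
    return all_tables
```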

### Export to Excel

```python
import pandas as pd

def export_to_excel(tables, output_path):
    """Export multiple tables to Excel with sheets."""

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for i, table in enumerate(tables):
            if not table:
                continue

            # Convert to DataFrame
            headers = table[0]
            data = table[1:]
            df = pd.DataFrame(data, columns=headers)

            # Write to sheet
            sheet_name = f"Table_{i + 1}"
            df.to_excel(writer, sheet_name=sheet_name, index=False)

            # Auto-adjust column widths
            worksheet = writer.sheets[sheet_name]
            for column in worksheet.columns:
                max_length = 0
                column_letter = column[0].column_letter
                for cell in column:
                    if len(str(cell.value)) > max_length:
                        max_length = len(str(cell.value))
                worksheet.column_dimensions[column_letter].width = max_length + 2

# Usage
tables = extract_all_tables("report.pdf")
export_to_excel(tables, "output.xlsx")
```

### Export to JSON

```python
import json

def export_to_json(table, output_path):
    """Export table to JSON."""

    if not table:
        return

    headers = table[0]
    data = table[1:]

    # Convert to list of dictionaries
    records = []
    for row in data:
        record = {}
        for i, header in enumerate(headers):
            value = row[i] if i < len(row) else None
            record[header] = value
        records.append(record)

    # Save to JSON
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

# Usage
table = extract_table("report.pdf")
export_to_json(table, "output.json")
```

## Table detection algorithms

### Visual debugging

```python
import pdfplumber

def visualize_table_detection(pdf_path, page_num=0, output_path="debug.png"):
    """Visualize detected table structure."""

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]

        # Draw detected table lines
        im = page.to_image(resolution=150)
        im = im.debug_tablefinder()
        im.save(output_path)

    print(f"Saved debug image to {output_path}")

# Usage
visualize_table_detection("document.pdf", page_num=0)
```

### Algorithm: Line-based detection

Best for tables with visible borders:

```python
table_settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines"
}

tables = page.extract_tables(table_settings=table_settings)
```

### Algorithm: Text-based detection

Best for tables without borders:

```python
table_settings = {
    "vertical_strategy": "text",
    "horizontal_strategy": "text"
}

tables = page.extract_tables(table_settings=table_settings)
```

### Algorithm: Explicit lines

For complex layouts, define lines manually and set both strategies to "explicit" so only the supplied lines are used:

```python
# Define vertical lines at x-coordinates
vertical_lines = [50, 150, 250, 350, 450, 550]

# Define horizontal lines at y-coordinates
horizontal_lines = [100, 130, 160, 190, 220, 250]

table_settings = {
    "vertical_strategy": "explicit",
    "horizontal_strategy": "explicit",
    "explicit_vertical_lines": vertical_lines,
    "explicit_horizontal_lines": horizontal_lines
}

tables = page.extract_tables(table_settings=table_settings)
```

## Custom extraction rules

### Rule-based extraction

```python
def extract_with_rules(page, rules):
    """Extract table using custom rules."""

    # Rule: "Headers are bold"
    if rules.get("bold_headers"):
        chars = page.chars
        bold_chars = [c for c in chars if "Bold" in c.get("fontname", "")]
        # Use bold chars to identify header row
        pass

    # Rule: "First column is always left-aligned"
    if rules.get("left_align_first_col"):
        # Adjust extraction to respect alignment
        pass

    # Rule: "Currency values in last column"
    if rules.get("currency_last_col"):
        # Parse currency format
        pass

    # Extract with adjusted settings
    return page.extract_tables()
```
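
The rule bodies above are placeholders. As one concrete possibility (a sketch, not the skill's own implementation), the "bold headers" rule could collect the words drawn in a bold font and look for the first extracted row whose cells match them; `extract_words(extra_attrs=[...])` is assumed to be available in your pdfplumber version:

```python
def find_bold_header_row(page, table):
    """Sketch: index of the first row whose non-empty cells all contain bold words."""
    words = page.extract_words(extra_attrs=["fontname"])
    bold_text = {w["text"] for w in words if "Bold" in w.get("fontname", "")}

    for idx, row in enumerate(table):
        cells = [c for c in row if c]  # ignore empty / merged cells
        if cells and all(any(part in bold_text for part in c.split()) for c in cells):
            return idx
    return 0  # fall back to the first row
```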

### Post-processing rules

```python
def apply_post_processing(table, rules):
    """Apply post-processing rules to extracted table."""

    processed = []

    for row in table:
        new_row = []

        for i, cell in enumerate(row):
            value = cell

            # Rule: Strip whitespace
            if rules.get("strip_whitespace"):
                value = value.strip() if value else value

            # Rule: Convert currency to float
            if rules.get("parse_currency") and i == len(row) - 1:
                if value and "$" in value:
                    value = float(value.replace("$", "").replace(",", ""))

            # Rule: Parse dates
            if rules.get("parse_dates") and i == 0:
                # Convert to datetime
                pass

            new_row.append(value)

        processed.append(new_row)

    return processed
```
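
A typical call, assuming `table` holds rows from `extract_tables()` and the last column contains dollar amounts:

```python
clean_table = apply_post_processing(table, {
    "strip_whitespace": True,
    "parse_currency": True,
})
```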

## Performance optimization

### Process large PDFs efficiently

```python
import gc

import pdfplumber

def extract_tables_optimized(pdf_path):
    """Extract tables with memory optimization."""

    results = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            print(f"Processing page {page_num + 1}/{len(pdf.pages)}")

            # Extract tables from current page
            tables = page.extract_tables()
            results.extend(tables)

            # Force garbage collection
            gc.collect()

    return results
```

### Parallel processing

```python
from concurrent.futures import ProcessPoolExecutor
import pdfplumber

def extract_page_tables(args):
    """Extract tables from a single page."""
    pdf_path, page_num = args

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]
        return page.extract_tables()

def extract_tables_parallel(pdf_path, max_workers=4):
    """Extract tables using multiple processes."""

    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)

    # Create tasks
    tasks = [(pdf_path, i) for i in range(page_count)]

    # Process in parallel
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(extract_page_tables, tasks))

    # Flatten results
    all_tables = []
    for page_tables in results:
        all_tables.extend(page_tables)

    return all_tables
```

## Production examples

### Example 1: Financial report extraction

```python
import pdfplumber
import pandas as pd

def extract_financial_tables(pdf_path):
    """Extract financial data with proper number formatting."""

    tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_tables = page.extract_tables()

            for table in page_tables:
                # Convert to DataFrame
                df = pd.DataFrame(table[1:], columns=table[0])

                # Parse currency columns (escape "$" so it is matched literally,
                # not treated as a regex end-of-string anchor)
                for col in df.columns:
                    if df[col].str.contains(r"\$", na=False).any():
                        df[col] = df[col].str.replace(r"[$,()]", "", regex=True)
                        df[col] = pd.to_numeric(df[col], errors="coerce")

                tables.append(df)

    return tables
```

### Example 2: Batch table extraction

```python
from pathlib import Path

def batch_extract_tables(input_dir, output_dir):
    """Extract tables from all PDFs in directory."""

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    for pdf_file in input_path.glob("*.pdf"):
        print(f"Processing: {pdf_file.name}")

        try:
            # Extract tables
            tables = extract_all_tables(str(pdf_file))

            # Export to Excel
            output_file = output_path / f"{pdf_file.stem}_tables.xlsx"
            export_to_excel(tables, str(output_file))

            print(f"  ✓ Extracted {len(tables)} table(s)")

        except Exception as e:
            print(f"  ✗ Error: {e}")

# Usage
batch_extract_tables("invoices/", "extracted/")
```

## Best practices

1. **Visualize first**: Use debug mode to understand table structure
2. **Test settings**: Try different strategies for best results
3. **Handle errors**: PDFs vary widely in quality
4. **Validate output**: Check extracted data makes sense
5. **Post-process**: Clean and normalize extracted data
6. **Use pandas**: Leverage DataFrame operations for analysis
7. **Cache results**: Avoid re-processing large files (see the sketch after this list)
8. **Monitor performance**: Profile for bottlenecks
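
A minimal caching sketch for practice 7, assuming tables are JSON-serializable lists of rows and reusing the `extract_all_tables` helper sketched earlier; the cache location and key scheme are illustrative assumptions, not part of the original skill:

```python
import hashlib
import json
from pathlib import Path

CACHE_DIR = Path(".table_cache")  # assumed cache location

def extract_all_tables_cached(pdf_path):
    """Sketch: reuse previously extracted tables when the PDF has not changed."""
    pdf_file = Path(pdf_path)
    stat = pdf_file.stat()
    key = hashlib.sha256(
        f"{pdf_file.resolve()}:{stat.st_size}:{stat.st_mtime_ns}".encode()
    ).hexdigest()
    cache_file = CACHE_DIR / f"{key}.json"

    if cache_file.exists():
        return json.loads(cache_file.read_text(encoding="utf-8"))

    tables = extract_all_tables(pdf_path)  # helper sketched earlier
    CACHE_DIR.mkdir(exist_ok=True)
    cache_file.write_text(json.dumps(tables), encoding="utf-8")
    return tables
```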

## Troubleshooting

### Tables not detected

1. Try different detection strategies (see the sketch below)
2. Use visual debugging to see structure
3. Define explicit lines manually
4. Check if table is actually an image
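
For step 1, one approach is to sweep the common strategy combinations and see which one finds anything; a rough sketch, assuming `page` is an open pdfplumber page:

```python
def try_detection_strategies(page):
    """Sketch: report how many tables each strategy combination detects."""
    strategies = ["lines", "lines_strict", "text"]

    for vertical in strategies:
        for horizontal in strategies:
            settings = {
                "vertical_strategy": vertical,
                "horizontal_strategy": horizontal,
            }
            tables = page.extract_tables(table_settings=settings)
            print(f"{vertical:12s} x {horizontal:12s}: {len(tables)} table(s)")
```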

### Incorrect cell values

1. Adjust snap/join tolerance
2. Check text extraction quality
3. Use post-processing to clean data
4. Verify PDF is not scanned image

### Performance issues

1. Process pages individually
2. Use parallel processing
3. Reduce image resolution
4. Extract only needed pages (see the sketch below)
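
For step 4, pdfplumber can be told up front which pages to load; a small sketch, assuming the `pages` argument of `pdfplumber.open` (1-based page numbers) is available in your version:

```python
import pdfplumber

def extract_tables_from_pages(pdf_path, page_numbers):
    """Sketch: open only the listed pages and extract their tables."""
    all_tables = []
    with pdfplumber.open(pdf_path, pages=page_numbers) as pdf:
        for page in pdf.pages:
            all_tables.extend(page.extract_tables())
    return all_tables

# e.g. only pages 2-4 of a long report
tables = extract_tables_from_pages("report.pdf", [2, 3, 4])
```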