pdify 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdify/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ from .pdify import (
2
+ create_directories,
3
+ group_words_into_lines,
4
+ split_line_into_segments,
5
+ clean_text,
6
+ reconstruct_paragraphs_from_lines,
7
+ extract_text_and_words_with_layout,
8
+ get_cohesive_figure_boxes,
9
+ process_single_page,
10
+ resolve_input_paths,
11
+ main,
12
+ )
13
+
14
+ __version__ = "0.1.0"
15
+ __all__ = [
16
+ "create_directories",
17
+ "group_words_into_lines",
18
+ "split_line_into_segments",
19
+ "clean_text",
20
+ "reconstruct_paragraphs_from_lines",
21
+ "extract_text_and_words_with_layout",
22
+ "get_cohesive_figure_boxes",
23
+ "process_single_page",
24
+ "resolve_input_paths",
25
+ "main",
26
+ ]
pdify/pdify.py ADDED
@@ -0,0 +1,672 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ import sys
5
+ import glob
6
+ import time
7
+ from concurrent.futures import ProcessPoolExecutor, as_completed
8
+ import pdfplumber
9
+
10
def create_directories(base_dir="extracted_content"):
    """Ensure the ``text`` and ``images`` output folders exist under *base_dir*.

    Args:
        base_dir: Root folder for extracted assets.

    Returns:
        A ``(text_dir, image_dir)`` tuple of the two sub-folder paths.
        Folders that already exist are left as they are.
    """
    subdirs = tuple(os.path.join(base_dir, leaf) for leaf in ("text", "images"))
    for folder in subdirs:
        os.makedirs(folder, exist_ok=True)
    text_dir, image_dir = subdirs
    return text_dir, image_dir
17
+
18
def group_words_into_lines(words, y_tolerance=3.0):
    """Cluster word dicts into visual lines by how close their ``top`` values are.

    Args:
        words: Iterable of dicts carrying at least ``top`` and ``x0``.
        y_tolerance: Maximum distance between a word's ``top`` and a line's
            running average ``top`` for the word to join that line.

    Returns:
        A list of ``{'top_avg': float, 'words': [...]}`` dicts ordered by
        ``top_avg``; the words of each line are ordered by ``x0``.
    """
    if not words:
        return []

    clusters = []
    for word in sorted(words, key=lambda item: item['top']):
        # Only the five most recently opened lines are candidates, newest
        # first, which tolerates small vertical misalignment between columns.
        target = next(
            (
                candidate
                for candidate in reversed(clusters[-5:])
                if abs(word['top'] - candidate['top_avg']) < y_tolerance
            ),
            None,
        )
        if target is None:
            clusters.append({'top_avg': word['top'], 'words': [word]})
            continue

        members = target['words']
        members.append(word)
        # Keep the line's reference position equal to the mean of its members.
        target['top_avg'] = sum(m['top'] for m in members) / len(members)

    # Left-to-right order inside every line.
    for cluster in clusters:
        cluster['words'] = sorted(cluster['words'], key=lambda item: item['x0'])

    # Top-to-bottom order across lines.
    return sorted(clusters, key=lambda cluster: cluster['top_avg'])
49
+
50
def split_line_into_segments(line, gap_threshold=15.0):
    """Break one line into segments wherever the horizontal gap is too wide.

    Args:
        line: Dict with a ``words`` list already ordered left to right; each
            word carries ``x0``, ``x1``, ``top``, ``bottom`` and ``text``.
        gap_threshold: A gap between consecutive words strictly larger than
            this starts a new segment.

    Returns:
        A list of segment dicts with the bounding coordinates (``x0``, ``x1``,
        ``top``, ``bottom``), the space-joined ``text`` and the member
        ``words``. An empty line yields an empty list.
    """
    words = line['words']
    if not words:
        return []

    runs = [[words[0]]]
    for word in words[1:]:
        previous = runs[-1][-1]
        if word['x0'] - previous['x1'] > gap_threshold:
            runs.append([word])
        else:
            runs[-1].append(word)

    return [
        {
            'x0': min(w['x0'] for w in run),
            'x1': max(w['x1'] for w in run),
            'top': min(w['top'] for w in run),
            'bottom': max(w['bottom'] for w in run),
            'text': " ".join(w['text'] for w in run),
            'words': run,
        }
        for run in runs
    ]
85
+
86
# Latin presentation-form ligatures mapped to their plain-letter spellings.
# The keys are written as explicit escapes so the table stays correct even if
# the source file passes through a tool that normalizes Unicode (which would
# otherwise turn every entry into a no-op such as "fi" -> "fi").
_LIGATURE_TABLE = str.maketrans({
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
    "\ufb05": "st",  # long-s + t ligature
    "\ufb06": "st",
})


def clean_text(text):
    """Expand typographic ligatures and drop ``(cid:0)`` placeholders.

    Args:
        text: Extracted text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    # One pass for all single-character ligatures, then strip the CID-0
    # placeholder sequence.
    return text.translate(_LIGATURE_TABLE).replace("(cid:0)", "")
102
+
103
def reconstruct_paragraphs_from_lines(lines, dehyphenate=True):
    """Join the lines of one text column into paragraphs.

    A new paragraph starts when a line begins with a list marker (``- ``,
    ``• ``, ``* `` or ``1. `` through ``9. ``), or when the previous line ends
    in ``.``, ``?`` or ``!`` and is shorter than 85% of the average line
    length. A line break after a trailing hyphen that follows a letter is
    treated as a split word and rejoined when *dehyphenate* is true; such a
    join takes precedence over starting a new paragraph.

    Args:
        lines: Sequence of line strings; blank lines are ignored.
        dehyphenate: Whether to rejoin words hyphenated across line breaks.

    Returns:
        The paragraphs separated by blank lines, or ``""`` when there is no
        non-blank input.
    """
    if not lines:
        return ""

    stripped = [text for text in (raw.strip() for raw in lines) if text]
    if not stripped:
        return ""

    short_cutoff = (sum(len(text) for text in stripped) / len(stripped)) * 0.85
    list_markers = ("- ", "• ", "* ", "1. ", "2. ", "3. ", "4. ", "5. ", "6. ", "7. ", "8. ", "9. ")

    finished = []
    pending = [stripped[0]]

    for text in stripped[1:]:
        last = pending[-1]

        joins_split_word = bool(
            dehyphenate
            and last.endswith("-")
            and len(last) > 1
            and last[-2].isalpha()
        )
        starts_paragraph = text.startswith(list_markers) or (
            last.endswith((".", "?", "!")) and len(last) < short_cutoff
        )

        if starts_paragraph and not joins_split_word:
            finished.append(" ".join(pending))
            pending = [text]
        elif joins_split_word:
            # Drop the hyphen and glue the first word of this line onto the
            # previous fragment; whatever remains becomes its own fragment.
            pieces = text.split(maxsplit=1)
            pending[-1] = last[:-1] + (pieces[0] if pieces else "")
            if len(pieces) > 1 and pieces[1]:
                pending.append(pieces[1])
        else:
            pending.append(text)

    if pending:
        finished.append(" ".join(pending))

    return "\n\n".join(finished)
165
+
166
def extract_text_and_words_with_layout(
    page,
    x_tolerance=1.5,
    y_tolerance=3.0,
    gap_threshold=15.0,
    remove_headers_footers=True,
    top_margin=50.0,
    bottom_margin=50.0,
    keep_page1_header=True,
    exclude_figure_text=True,
    figure_boxes=None,
    dehyphenate=True,
    reconstruct_paragraphs=True
):
    """Return a page's text and words in visual reading order.

    Words are filtered (header/footer bands, figure interiors), grouped into
    lines, split into horizontal segments, and the segments are accumulated
    into columns. Whenever the column layout changes, the accumulated columns
    are emitted left to right before a new set is started.

    Args:
        page: Object providing ``extract_words(x_tolerance=, y_tolerance=)``,
            ``page_number`` and ``height`` (used as a pdfplumber page).
        x_tolerance, y_tolerance: Passed to ``page.extract_words``;
            ``y_tolerance`` is also used for line grouping.
        gap_threshold: Horizontal gap that separates segments of one line.
        remove_headers_footers: Drop words in the top/bottom margin bands.
        top_margin, bottom_margin: Heights of those bands.
        keep_page1_header: Keep the top band on page 1.
        exclude_figure_text: Drop words whose centre lies in a figure box.
        figure_boxes: Iterable of ``(x0, top, x1, bottom)`` boxes, or ``None``.
        dehyphenate: Forwarded to paragraph reconstruction.
        reconstruct_paragraphs: Merge column lines into paragraphs instead of
            joining them with newlines.

    Returns:
        ``(text, words)``: the text blocks joined by blank lines, and the
        retained word dicts in output order. ``("", [])`` if nothing remains.
    """
    raw_words = page.extract_words(x_tolerance=x_tolerance, y_tolerance=y_tolerance)
    if not raw_words:
        return "", []

    page_num = page.page_number
    page_height = float(page.height)
    # The header band is stripped on every page except, optionally, page 1.
    strip_header_band = page_num > 1 or not keep_page1_header
    footer_start = page_height - bottom_margin

    kept_words = []
    for raw in raw_words:
        cleaned = clean_text(raw["text"])
        if not cleaned.strip():
            continue

        word = dict(raw)
        word["text"] = cleaned

        # 1. Header and footer removal
        if remove_headers_footers:
            in_header = strip_header_band and word["top"] < top_margin
            in_footer = word["bottom"] > footer_start
            if in_header or in_footer:
                continue

        # 2. Figure text removal, decided by the word's centre point
        if exclude_figure_text and figure_boxes:
            centre_x = (word["x0"] + word["x1"]) / 2.0
            centre_y = (word["top"] + word["bottom"]) / 2.0
            if any(
                box[0] <= centre_x <= box[2] and box[1] <= centre_y <= box[3]
                for box in figure_boxes
            ):
                continue

        kept_words.append(word)

    if not kept_words:
        return "", []

    lines = group_words_into_lines(kept_words, y_tolerance=y_tolerance)

    blocks_text = []
    blocks_words = []
    columns = []

    def _shared_width(seg, col):
        """Horizontal overlap between a segment and a column (0 if disjoint)."""
        return max(0, min(seg['x1'], col['x1']) - max(seg['x0'], col['x0']))

    def _open_column(seg):
        """Start a column anchored on *seg*."""
        return {
            'x0': seg['x0'],
            'x1': seg['x1'],
            'lines_text': [seg['text']],
            'lines_words': seg['words'],
        }

    def _flush_columns():
        """Emit the accumulated columns left to right, then reset them."""
        if not columns:
            return
        for col in sorted(columns, key=lambda c: c['x0']):
            if not col['lines_text']:
                continue
            if reconstruct_paragraphs:
                para_text = reconstruct_paragraphs_from_lines(col['lines_text'], dehyphenate=dehyphenate)
                if para_text:
                    blocks_text.append(para_text)
            else:
                blocks_text.append("\n".join(col['lines_text']))
            blocks_words.extend(col['lines_words'])
        columns.clear()

    for line in lines:
        segments = split_line_into_segments(line, gap_threshold=gap_threshold)
        if not segments:
            continue

        open_count = len(columns)
        if open_count == 1:
            # One column so far and the line now splits into several.
            layout_changed = len(segments) > 1
        elif open_count > 1:
            # A segment that overlaps more than one open column by over
            # 5 units spans across them, so the column layout has ended.
            layout_changed = any(
                sum(1 for col in columns if _shared_width(seg, col) > 5) > 1
                for seg in segments
            )
        else:
            layout_changed = False

        if layout_changed:
            _flush_columns()

        if not columns:
            # Every segment anchors a fresh column.
            columns.extend(_open_column(seg) for seg in segments)
            continue

        # Attach each segment to the column it overlaps most; a column may
        # receive at most one segment per line.
        claimed = set()
        for seg in segments:
            best_idx = None
            best_width = -1
            for idx, col in enumerate(columns):
                width = _shared_width(seg, col)
                if width > 0 and width > best_width:
                    best_width = width
                    best_idx = idx

            if best_idx is None or best_idx in claimed:
                # No overlap, or the best column is already taken on this line.
                columns.append(_open_column(seg))
                continue

            col = columns[best_idx]
            col['x0'] = min(col['x0'], seg['x0'])
            col['x1'] = max(col['x1'], seg['x1'])
            col['lines_text'].append(seg['text'])
            col['lines_words'].extend(seg['words'])
            claimed.add(best_idx)

    _flush_columns()
    return "\n\n".join(blocks_text), blocks_words
328
+
329
def get_cohesive_figure_boxes(page):
    """Collect figure bounding boxes and merge fragments into whole figures.

    Candidate boxes come from ``page.images`` and from the ``figure`` and
    ``rect`` entries of ``page.objects``. A candidate is skipped when it is at
    least 95% of the page in both width and height; figure/rect containers
    must additionally exceed 50 units in both dimensions. Boxes that overlap
    or lie within 15 units of each other are merged, and any box still
    enclosed by a larger one (2-unit margin) is discarded.

    Args:
        page: Object exposing ``images``, ``objects``, ``width`` and
            ``height`` (used as a pdfplumber page).

    Returns:
        A list of unique ``(x0, top, x1, bottom)`` tuples; empty when no
        candidate is found.
    """
    proximity = 15.0
    margin = 2.0

    def _bounds(obj):
        return [float(obj["x0"]), float(obj["top"]), float(obj["x1"]), float(obj["bottom"])]

    def _smaller_than_page(width, height):
        return width < page.width * 0.95 or height < page.height * 0.95

    def _first_close_pair(boxes):
        """Return the first (i, j), i < j, of boxes that overlap or nearly touch."""
        for i in range(len(boxes)):
            for j in range(i + 1, len(boxes)):
                a, b = boxes[i], boxes[j]
                apart = (a[2] < b[0] - proximity or a[0] > b[2] + proximity or
                         a[3] < b[1] - proximity or a[1] > b[3] + proximity)
                if not apart:
                    return i, j
        return None

    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    candidates = []

    # 1. Embedded images
    for img in page.images or []:
        if _smaller_than_page(float(img["width"]), float(img["height"])):
            candidates.append(_bounds(img))

    # 2. Figure and rect containers of meaningful size
    for container in page.objects.get("figure", []) + page.objects.get("rect", []):
        width = float(container["width"])
        height = float(container["height"])
        if width > 50 and height > 50 and _smaller_than_page(width, height):
            candidates.append(_bounds(container))

    if not candidates:
        return []

    # 3. Merge one pair at a time and rescan, because a grown box may now
    #    reach boxes that were previously too far away.
    pair = _first_close_pair(candidates)
    while pair is not None:
        keep, drop = pair
        a, b = candidates[keep], candidates[drop]
        candidates[keep] = [
            min(a[0], b[0]),
            min(a[1], b[1]),
            max(a[2], b[2]),
            max(a[3], b[3]),
        ]
        del candidates[drop]
        pair = _first_close_pair(candidates)

    # 4. Drop boxes enclosed by a strictly larger box, and exact duplicates.
    survivors = []
    for idx, box in enumerate(candidates):
        enclosed = any(
            other_idx != idx
            and other[0] - margin <= box[0] and other[1] - margin <= box[1]
            and box[2] <= other[2] + margin and box[3] <= other[3] + margin
            and _area(other) > _area(box)
            for other_idx, other in enumerate(candidates)
        )
        if enclosed:
            continue
        as_tuple = tuple(box)
        if as_tuple not in survivors:
            survivors.append(as_tuple)

    return survivors
406
+
407
def process_single_page(args_dict):
    """Extract the text and figures of one PDF page and write them to disk.

    All settings arrive in a single plain dictionary so the task can be handed
    to a worker process without pickling problems.

    Args:
        args_dict: Must contain ``pdf_path``, ``page_num`` (1-based),
            ``output_base``, ``resolution`` and the layout options
            ``x_tolerance``, ``y_tolerance``, ``gap_threshold``,
            ``remove_headers_footers``, ``top_margin``, ``bottom_margin``,
            ``keep_page1_header``, ``exclude_figure_text``, ``dehyphenate``
            and ``reconstruct_paragraphs``.

    Returns:
        ``(pdf_path, page_num, success, error_message)``; the message is
        ``None`` on success. A figure that cannot be cropped only produces a
        printed warning and does not fail the page.
    """
    pdf_path = args_dict["pdf_path"]
    page_num = args_dict["page_num"]
    output_base = args_dict["output_base"]
    layout_options = {
        key: args_dict[key]
        for key in ("x_tolerance", "y_tolerance", "gap_threshold")
    }
    resolution = args_dict["resolution"]
    layout_options.update(
        (key, args_dict[key])
        for key in (
            "remove_headers_footers",
            "top_margin",
            "bottom_margin",
            "keep_page1_header",
            "exclude_figure_text",
            "dehyphenate",
            "reconstruct_paragraphs",
        )
    )

    # Each PDF gets its own sub-directory named after the file stem.
    source_name = os.path.basename(pdf_path)
    pdf_name = os.path.splitext(source_name)[0]
    pdf_output_dir = os.path.join(output_base, pdf_name)
    text_dir = os.path.join(pdf_output_dir, "text")
    image_dir = os.path.join(pdf_output_dir, "images")

    os.makedirs(text_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    try:
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_num - 1]
            page_width = float(page.width)
            page_height = float(page.height)

            # 1. Figure boxes come first because text extraction needs them.
            figure_boxes = get_cohesive_figure_boxes(page)

            # 2. Text in reading order plus the retained words.
            text_content, words = extract_text_and_words_with_layout(
                page,
                figure_boxes=figure_boxes,
                **layout_options
            )

            if words:
                text_path = os.path.join(text_dir, f"page_{page_num}_text.txt")
                with open(text_path, "w", encoding="utf-8") as handle:
                    handle.write(text_content or "")

                text_meta = {
                    "source_file": source_name,
                    "page_number": page_num,
                    "page_width": page_width,
                    "page_height": page_height,
                    "text_file_path": text_path,
                    "word_coordinates": [
                        {
                            "text": word["text"],
                            "x0": round(float(word["x0"]), 2),
                            "top": round(float(word["top"]), 2),
                            "x1": round(float(word["x1"]), 2),
                            "bottom": round(float(word["bottom"]), 2)
                        }
                        for word in words
                    ]
                }
                meta_path = os.path.join(text_dir, f"page_{page_num}_text_metadata.json")
                with open(meta_path, "w") as handle:
                    json.dump(text_meta, handle, indent=4)

            # 3. Render the page once and crop every figure from that image.
            if figure_boxes:
                try:
                    page_img = page.to_image(resolution=resolution).original
                    scale_x = page_img.width / page_width
                    scale_y = page_img.height / page_height
                except Exception as e:
                    return pdf_path, page_num, False, f"Could not render page for figure extraction: {e}"

                for fig_number, raw_bbox in enumerate(figure_boxes, start=1):
                    # Clamp the box to the page before converting to pixels.
                    x0 = max(0, raw_bbox[0])
                    top = max(0, raw_bbox[1])
                    x1 = min(page_width, raw_bbox[2])
                    bottom = min(page_height, raw_bbox[3])

                    if x1 <= x0 or bottom <= top:
                        continue

                    try:
                        pixel_box = (
                            int(x0 * scale_x),
                            int(top * scale_y),
                            int(x1 * scale_x),
                            int(bottom * scale_y)
                        )
                        img_path = os.path.join(image_dir, f"page_{page_num}_figure_{fig_number}.png")
                        page_img.crop(pixel_box).save(img_path)

                        img_meta = {
                            "source_file": source_name,
                            "page_number": page_num,
                            "figure_index": fig_number,
                            "image_file_path": img_path,
                            "bounding_box": {
                                "x0": round(x0, 2),
                                "top": round(top, 2),
                                "x1": round(x1, 2),
                                "bottom": round(bottom, 2),
                                "width": round(x1 - x0, 2),
                                "height": round(bottom - top, 2)
                            }
                        }

                        fig_meta_path = os.path.join(image_dir, f"page_{page_num}_figure_{fig_number}_metadata.json")
                        with open(fig_meta_path, "w") as handle:
                            json.dump(img_meta, handle, indent=4)
                    except Exception as e:
                        print(f" [Warning] Could not crop figure {fig_number} on page {page_num} of {pdf_name}: {e}")

            return pdf_path, page_num, True, None
    except Exception as e:
        return pdf_path, page_num, False, str(e)
543
+
544
def resolve_input_paths(input_args):
    """Expand directories, glob patterns and plain paths into PDF file paths.

    Directories contribute their direct children whose name ends in ``.pdf``
    in any letter case (hidden files are skipped, as a ``*.pdf`` glob would).
    Other arguments are expanded with ``glob``; an argument that matches
    nothing as a pattern but exists literally (for example a file name
    containing ``[``) is used as is. Directory listings and glob matches are
    sorted so the result does not depend on filesystem enumeration order.

    Args:
        input_args: Iterable of directory paths, glob patterns or file paths.

    Returns:
        Paths of existing regular files with a ``.pdf`` extension (any case),
        in first-seen order, de-duplicated by absolute path.
    """
    candidates = []
    for pattern in input_args:
        if os.path.isdir(pattern):
            # Listing the directory (instead of globbing "*.pdf") accepts
            # "REPORT.PDF" on case-sensitive filesystems, matching how
            # explicitly named files are filtered below.
            candidates.extend(
                os.path.join(pattern, name)
                for name in sorted(os.listdir(pattern))
                if not name.startswith(".") and name.lower().endswith(".pdf")
            )
            continue

        matches = sorted(glob.glob(pattern))
        if matches:
            candidates.extend(matches)
        elif os.path.exists(pattern):
            candidates.append(pattern)

    unique_paths = []
    seen = set()
    for path in candidates:
        absolute = os.path.abspath(path)
        if absolute in seen:
            continue
        if os.path.isfile(absolute) and path.lower().endswith(".pdf"):
            seen.add(absolute)
            unique_paths.append(path)

    return unique_paths
568
+
569
def main(argv=None):
    """Command-line entry point: extract text and figures from PDF files.

    Every page of every resolved PDF becomes one task that is processed in a
    pool of worker processes.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` when every page was processed, ``1`` when no input or page was
        found or at least one page failed.

    Raises:
        SystemExit: Raised by argparse for invalid arguments, including a
            ``--workers`` value below 1.
    """
    parser = argparse.ArgumentParser(description="Extract text layout and cohesive figures from PDF files.")
    parser.add_argument("--input", "-i", nargs="+", default=["./ALPAR.pdf"], help="Path to input PDF file(s), directory, or glob pattern (default: ./ALPAR.pdf).")
    parser.add_argument("--output", "-o", default="extracted_content", help="Base directory for extracted assets (default: extracted_content).")
    parser.add_argument("--x-tolerance", "-x", type=float, default=1.5, help="Horizontal character spacing tolerance (default: 1.5).")
    parser.add_argument("--y-tolerance", "-y", type=float, default=3.0, help="Vertical line grouping tolerance (default: 3.0).")
    parser.add_argument("--gap-threshold", "-g", type=float, default=15.0, help="Horizontal gap between columns (default: 15.0).")
    parser.add_argument("--resolution", "-r", type=int, default=200, help="Resolution in DPI for cropping images (default: 200).")

    # Performance Options
    parser.add_argument("--workers", "-w", type=int, default=None, help="Number of parallel processes (default: min(4, CPU count)).")

    # Paper Layout Options
    parser.add_argument("--remove-headers-footers", action="store_true", default=True, help="Remove running headers/footers (default: True).")
    parser.add_argument("--no-remove-headers-footers", dest="remove_headers_footers", action="store_false", help="Keep running headers/footers.")
    parser.add_argument("--top-margin", type=float, default=50.0, help="Top margin height in points for header removal (default: 50.0).")
    parser.add_argument("--bottom-margin", type=float, default=50.0, help="Bottom margin height in points for footer removal (default: 50.0).")
    parser.add_argument("--keep-page1-header", action="store_true", default=True, help="Do not remove top margin text on page 1 (default: True).")
    parser.add_argument("--no-keep-page1-header", dest="keep_page1_header", action="store_false", help="Remove top margin text on page 1.")
    parser.add_argument("--exclude-figure-text", action="store_true", default=True, help="Exclude text inside cropped figures from main text (default: True).")
    parser.add_argument("--no-exclude-figure-text", dest="exclude_figure_text", action="store_false", help="Keep text inside figures in main text.")
    parser.add_argument("--dehyphenate", action="store_true", default=True, help="Join line-break hyphenated words (default: True).")
    parser.add_argument("--no-dehyphenate", dest="dehyphenate", action="store_false", help="Do not join line-break hyphenated words.")
    parser.add_argument("--reconstruct-paragraphs", action="store_true", default=True, help="Merge lines back into paragraphs (default: True).")
    parser.add_argument("--no-reconstruct-paragraphs", dest="reconstruct_paragraphs", action="store_false", help="Do not merge lines into paragraphs.")

    args = parser.parse_args(argv)

    # Reject a non-positive pool size up front; otherwise ProcessPoolExecutor
    # raises ValueError only after all PDFs have been opened and counted.
    if args.workers is not None and args.workers < 1:
        parser.error("--workers must be a positive integer")

    pdf_paths = resolve_input_paths(args.input)
    if not pdf_paths:
        print(f"Error: No PDF files found matching input patterns: {args.input}", file=sys.stderr)
        return 1

    print(f"Found {len(pdf_paths)} PDF file(s) to process:")
    for p in pdf_paths:
        print(f" - {p}")

    # Settings shared by every page task.
    shared_options = {
        "output_base": args.output,
        "x_tolerance": args.x_tolerance,
        "y_tolerance": args.y_tolerance,
        "gap_threshold": args.gap_threshold,
        "resolution": args.resolution,
        "remove_headers_footers": args.remove_headers_footers,
        "top_margin": args.top_margin,
        "bottom_margin": args.bottom_margin,
        "keep_page1_header": args.keep_page1_header,
        "exclude_figure_text": args.exclude_figure_text,
        "dehyphenate": args.dehyphenate,
        "reconstruct_paragraphs": args.reconstruct_paragraphs,
    }

    tasks = []
    for pdf_path in pdf_paths:
        try:
            # Open each PDF once here only to learn its page count.
            with pdfplumber.open(pdf_path) as pdf:
                num_pages = len(pdf.pages)
            tasks.extend(
                {"pdf_path": pdf_path, "page_num": page_num, **shared_options}
                for page_num in range(1, num_pages + 1)
            )
        except Exception as e:
            # An unreadable file is reported and skipped; the others proceed.
            print(f"Error opening {pdf_path}: {e}", file=sys.stderr)

    if not tasks:
        print("No pages to process.", file=sys.stderr)
        return 1

    print(f"Total pages to process: {len(tasks)}")

    workers = args.workers
    if workers is None:
        workers = min(4, os.cpu_count() or 1)

    print(f"Running with {workers} parallel worker(s)...")

    start_time = time.time()
    success_count = 0
    failure_count = 0

    with ProcessPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(process_single_page, t) for t in tasks]
        for future in as_completed(futures):
            try:
                pdf_path, page_num, success, err = future.result()
                pdf_name = os.path.basename(pdf_path)
                if success:
                    success_count += 1
                    print(f" [Success] Processed page {page_num} of {pdf_name}")
                else:
                    failure_count += 1
                    print(f" [Failure] Page {page_num} of {pdf_name}: {err}", file=sys.stderr)
            except Exception as e:
                # The worker itself failed (for example it crashed), so no
                # result tuple is available for this page.
                failure_count += 1
                print(f" [Error] Worker failed: {e}", file=sys.stderr)

    elapsed = time.time() - start_time
    if failure_count > 0:
        # Do not claim a successful export when some pages failed.
        print(f"\nFinished with {failure_count} failed page(s); partial output written to '{args.output}/'")
    else:
        print(f"\nDone! Cohesive figures and text successfully exported to '{args.output}/'")
    print(f"Completed in {elapsed:.2f} seconds.")
    print(f"Successfully processed {success_count} / {len(tasks)} pages.")
    return 1 if failure_count > 0 else 0
670
+
671
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,83 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdify
3
+ Version: 0.1.0
4
+ Summary: Extract text layout and cohesive figures from PDF files
5
+ Author-email: momos22251 <momos22251@gmail.com>
6
+ Project-URL: Homepage, https://github.com/momos22251/PDiFy
7
+ Project-URL: Bug Tracker, https://github.com/momos22251/PDiFy/issues
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
12
+ Classifier: Topic :: Text Processing :: Markup
13
+ Requires-Python: >=3.8
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: pdfplumber>=0.10.0
17
+ Requires-Dist: pillow>=9.0.0
18
+ Dynamic: license-file
19
+
20
+ # pdify
21
+
22
+ `pdify` is a Python package to extract clean text layout structures and cohesive figures from PDF documents (like research papers or reports), preventing fragmented crops or nested image duplication.
23
+
24
+ ## Features
25
+ - **Cohesive Figure Extraction**: Groups overlapping/nested image elements into single cohesive crops.
26
+ - **Layout Reconstruction**: Reconstructs paragraphs and column spans in correct visual reading order.
27
+ - **Hyphenation & Ligatures**: Automatically rejoins words hyphenated across line breaks and expands common typographic ligatures into plain letters.
28
+ - **Parallel Processing**: Uses multiprocessing to extract pages concurrently.
29
+
30
+ ## Installation
31
+
32
+ You can install `pdify` directly from PyPI:
33
+ ```bash
34
+ pip install pdify
35
+ ```
36
+ *Note: Depending on your system and PDF files, you may also need `ghostscript` if rendering certain vector formats.*
37
+
38
+ ## Usage
39
+
40
+ ### Command Line Interface (CLI)
41
+
42
+ Once installed, you can use the `pdify` CLI directly:
43
+ ```bash
44
+ pdify --input path/to/document.pdf
45
+ ```
46
+ Run `pdify --help` to view all available CLI options.
47
+
48
+ ### Python API
49
+
50
+ You can also import and use `pdify` programmatically in your own Python projects:
51
+
52
+ ```python
53
+ import pdfplumber
54
+ from pdify import extract_text_and_words_with_layout, get_cohesive_figure_boxes
55
+
56
+ # Open a PDF file
57
+ with pdfplumber.open("document.pdf") as pdf:
58
+ # Process the first page
59
+ page = pdf.pages[0]
60
+
61
+ # 1. Get cohesive figure boxes (automatically merging nested crops)
62
+ figure_boxes = get_cohesive_figure_boxes(page)
63
+
64
+ # 2. Extract text layout while ignoring headers/footers and text within figure boxes
65
+ text_content, words = extract_text_and_words_with_layout(
66
+ page,
67
+ remove_headers_footers=True,
68
+ exclude_figure_text=True,
69
+ figure_boxes=figure_boxes
70
+ )
71
+
72
+ print(text_content)
73
+ ```
74
+
75
+ ## Output Structure
76
+
77
+ Outputs are saved to `extracted_content/<pdf_name>/` by default:
78
+ - **`text/`**: Clean page text (`page_N_text.txt`) and word coordinate metadata (`page_N_text_metadata.json`).
79
+ - **`images/`**: High-resolution cohesive figure crops (`page_N_figure_M.png`) and bounding box metadata (`page_N_figure_M_metadata.json`).
80
+
81
+ ## License
82
+ MIT License. See [LICENSE](LICENSE) for details.
83
+
@@ -0,0 +1,8 @@
1
+ pdify/__init__.py,sha256=grtPNWs9rtz_jWNOL1nh39iTMDQ2UYdoaMHlMyu4mvg,616
2
+ pdify/pdify.py,sha256=1IKT11dC3k_dQ6UBVbcD3BKxttBPrz3ljfYDcQlP6jY,27457
3
+ pdify-0.1.0.dist-info/licenses/LICENSE,sha256=ESYyLizI0WWtxMeS7rGVcX3ivMezm-HOd5WdeOh-9oU,1056
4
+ pdify-0.1.0.dist-info/METADATA,sha256=KJwwTFFCOsNACKVWAQBcU1rtTF4-SQH7Rc61U96COOw,2897
5
+ pdify-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
6
+ pdify-0.1.0.dist-info/entry_points.txt,sha256=e9lAwKdAEwV9-tIVopE8zBWCNHB6qhxl9tjQM3TWtCc,43
7
+ pdify-0.1.0.dist-info/top_level.txt,sha256=9rue_RnbIdAe06pPQuSUCbdzv_PXPANCR2DyeBfSDF4,6
8
+ pdify-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pdify = pdify.pdify:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ pdify