doctra 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,7 +19,6 @@ class VLMStructuredExtractor:
19
19
  chart = vlm.extract_chart("/abs/path/chart.jpg")
20
20
  table = vlm.extract_table("/abs/path/table.jpg")
21
21
 
22
- # Or with Anthropic:
23
22
  vlm = VLMStructuredExtractor(vlm_provider="anthropic", api_key="YOUR_KEY")
24
23
  """
25
24
 
@@ -32,8 +31,6 @@ class VLMStructuredExtractor:
32
31
  ):
33
32
  """
34
33
  Initialize the VLMStructuredExtractor with provider configuration.
35
-
36
- Sets up the VLM model for structured data extraction from images.
37
34
 
38
35
  :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
39
36
  :param vlm_model: Model name to use (defaults to provider-specific defaults)
@@ -60,8 +57,6 @@ class VLMStructuredExtractor:
60
57
  :raises Exception: If image processing or VLM call fails
61
58
  """
62
59
  try:
63
- # Normalize path and verify readability
64
- # (get_image_from_local already absolutizes & raises if missing)
65
60
  img = get_image_from_local(image_path)
66
61
  if img.mode != "RGB":
67
62
  img = img.convert("RGB")
@@ -71,15 +66,11 @@ class VLMStructuredExtractor:
71
66
 
72
67
  return result
73
68
  except Exception as e:
74
- # Re-raise so caller can handle/log too
75
69
  raise
76
70
 
77
71
  def extract_chart(self, image_path: str) -> Chart:
78
72
  """
79
73
  Extract structured chart data from an image.
80
-
81
- Uses VLM to analyze a chart image and extract the data in a structured
82
- format with title, headers, and rows.
83
74
 
84
75
  :param image_path: Path to the chart image file
85
76
  :return: Chart object containing extracted title, headers, and data rows
@@ -96,9 +87,6 @@ class VLMStructuredExtractor:
96
87
  def extract_table(self, image_path: str) -> Table:
97
88
  """
98
89
  Extract structured table data from an image.
99
-
100
- Uses VLM to analyze a table image and extract the data in a structured
101
- format with title, headers, and rows.
102
90
 
103
91
  :param image_path: Path to the table image file
104
92
  :return: Table object containing extracted title, headers, and data rows
@@ -64,22 +64,19 @@ class StructuredPDFParser:
64
64
  ):
65
65
  """
66
66
  Initialize the StructuredPDFParser with processing configuration.
67
-
68
- Sets up the layout detection engine, OCR engine, and optionally
69
- the VLM service for comprehensive document processing.
70
67
 
71
- :param use_vlm: Whether to use VLM for structured data extraction
72
- :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
68
+ :param use_vlm: Whether to use VLM for structured data extraction (default: False)
69
+ :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
73
70
  :param vlm_model: Model name to use (defaults to provider-specific defaults)
74
- :param vlm_api_key: API key for VLM provider
75
- :param layout_model_name: Layout detection model name
76
- :param dpi: DPI for PDF rendering
77
- :param min_score: Minimum confidence score for layout detection
78
- :param ocr_lang: OCR language code
79
- :param ocr_psm: Tesseract page segmentation mode
80
- :param ocr_oem: Tesseract OCR engine mode
81
- :param ocr_extra_config: Additional Tesseract configuration
82
- :param box_separator: Separator between text boxes in output
71
+ :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
72
+ :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
73
+ :param dpi: DPI for PDF rendering (default: 200)
74
+ :param min_score: Minimum confidence score for layout detection (default: 0.0)
75
+ :param ocr_lang: OCR language code (default: "eng")
76
+ :param ocr_psm: Tesseract page segmentation mode (default: 4)
77
+ :param ocr_oem: Tesseract OCR engine mode (default: 3)
78
+ :param ocr_extra_config: Additional Tesseract configuration (default: "")
79
+ :param box_separator: Separator between text boxes in output (default: "\n")
83
80
  """
84
81
  self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
85
82
  self.dpi = dpi
@@ -100,15 +97,10 @@ class StructuredPDFParser:
100
97
  def parse(self, pdf_path: str) -> None:
101
98
  """
102
99
  Parse a PDF document and extract all content types.
103
-
104
- Processes the PDF through layout detection, extracts text using OCR,
105
- saves images for visual elements, and optionally converts charts/tables
106
- to structured data using VLM.
107
100
 
108
101
  :param pdf_path: Path to the input PDF file
109
102
  :return: None
110
103
  """
111
- # Extract filename without extension and create output directory
112
104
  pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
113
105
  out_dir = f"outputs/{pdf_filename}/full_parse"
114
106
 
@@ -120,7 +112,6 @@ class StructuredPDFParser:
120
112
  )
121
113
  pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
122
114
 
123
- # Count for progress bars
124
115
  fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
125
116
  chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
126
117
  table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
@@ -133,11 +124,8 @@ class StructuredPDFParser:
133
124
  figures_desc = "Figures (cropped)"
134
125
 
135
126
  with ExitStack() as stack:
136
- # Enhanced environment detection
137
127
  is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
138
128
  is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
139
-
140
- # Use appropriate progress bars based on environment
141
129
  if is_notebook:
142
130
  charts_bar = stack.enter_context(
143
131
  create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
@@ -165,13 +153,11 @@ class StructuredPDFParser:
165
153
  rel = os.path.relpath(abs_img_path, out_dir)
166
154
 
167
155
  if box.label == "figure":
168
- # Figures are always images in MD
169
156
  md_lines.append(f"![Figure — page {page_num}]({rel})\n")
170
157
  if figures_bar: figures_bar.update(1)
171
158
 
172
159
  elif box.label == "chart":
173
160
  if self.use_vlm and self.vlm:
174
- # Try structured → Markdown table; fallback to image if it fails
175
161
  wrote_table = False
176
162
  try:
177
163
  chart = self.vlm.extract_chart(abs_img_path)
@@ -193,7 +179,6 @@ class StructuredPDFParser:
193
179
 
194
180
  elif box.label == "table":
195
181
  if self.use_vlm and self.vlm:
196
- # Try structured → Markdown table; fallback to image if it fails
197
182
  wrote_table = False
198
183
  try:
199
184
  table = self.vlm.extract_table(abs_img_path)
@@ -229,7 +214,6 @@ class StructuredPDFParser:
229
214
  html_structured_path = os.path.join(out_dir, "tables.html")
230
215
  write_structured_html(html_structured_path, structured_items)
231
216
 
232
- # Print completion message with output directory
233
217
  print(f"✅ Parsing completed successfully!")
234
218
  print(f"📁 Output directory: {out_dir}")
235
219
 
@@ -249,30 +233,25 @@ class StructuredPDFParser:
249
233
  :param save_path: Optional path to save the visualization (if None, displays only)
250
234
  :return: None
251
235
  """
252
- # Get layout predictions
253
236
  pages: List[LayoutPage] = self.layout_engine.predict_pdf(
254
237
  pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
255
238
  )
256
239
  pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
257
240
 
258
- # Limit to requested number of pages
259
241
  pages_to_show = min(num_pages, len(pages))
260
242
 
261
243
  if pages_to_show == 0:
262
244
  print("No pages to display")
263
245
  return
264
246
 
265
- # Calculate grid dimensions
266
247
  rows = (pages_to_show + cols - 1) // cols
267
248
 
268
- # Collect unique labels from the processed pages and assign colors
269
249
  used_labels = set()
270
250
  for idx in range(pages_to_show):
271
251
  page = pages[idx]
272
252
  for box in page.boxes:
273
253
  used_labels.add(box.label.lower())
274
254
 
275
- # Create dynamic color assignment for all detected labels
276
255
  base_colors = ['#3B82F6', '#EF4444', '#10B981', '#F59E0B', '#8B5CF6',
277
256
  '#F97316', '#EC4899', '#6B7280', '#84CC16', '#06B6D4',
278
257
  '#DC2626', '#059669', '#7C3AED', '#DB2777', '#0891B2']
@@ -281,22 +260,18 @@ class StructuredPDFParser:
281
260
  for i, label in enumerate(sorted(used_labels)):
282
261
  dynamic_label_colors[label] = base_colors[i % len(base_colors)]
283
262
 
284
- # Process each page and add bounding boxes
285
263
  processed_pages = []
286
264
 
287
265
  for idx in range(pages_to_show):
288
266
  page = pages[idx]
289
267
  page_img = pil_pages[idx].copy()
290
268
 
291
- # Calculate scale factor to resize to target width
292
269
  scale_factor = page_width / page_img.width
293
270
  new_height = int(page_img.height * scale_factor)
294
271
  page_img = page_img.resize((page_width, new_height), Image.LANCZOS)
295
272
 
296
- # Create drawing context
297
273
  draw = ImageDraw.Draw(page_img)
298
274
 
299
- # Try to load a nice font, fallback to default
300
275
  try:
301
276
  font = ImageFont.truetype("arial.ttf", 24)
302
277
  small_font = ImageFont.truetype("arial.ttf", 18)
@@ -308,21 +283,16 @@ class StructuredPDFParser:
308
283
  font = None
309
284
  small_font = None
310
285
 
311
- # Draw bounding boxes
312
286
  for box in page.boxes:
313
- # Scale coordinates
314
287
  x1 = int(box.x1 * scale_factor)
315
288
  y1 = int(box.y1 * scale_factor)
316
289
  x2 = int(box.x2 * scale_factor)
317
290
  y2 = int(box.y2 * scale_factor)
318
291
 
319
- # Get color for this label from dynamic assignment
320
292
  color = dynamic_label_colors.get(box.label.lower(), '#000000')
321
293
 
322
- # Draw rectangle with rounded corners effect
323
294
  draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
324
295
 
325
- # Draw label background
326
296
  label_text = f"{box.label} ({box.score:.2f})"
327
297
  if font:
328
298
  bbox = draw.textbbox((0, 0), label_text, font=small_font)
@@ -332,11 +302,9 @@ class StructuredPDFParser:
332
302
  text_width = len(label_text) * 8
333
303
  text_height = 15
334
304
 
335
- # Position label above the box
336
305
  label_x = x1
337
306
  label_y = max(0, y1 - text_height - 8)
338
307
 
339
- # Draw label background with padding
340
308
  padding = 4
341
309
  draw.rectangle([
342
310
  label_x - padding,
@@ -345,10 +313,8 @@ class StructuredPDFParser:
345
313
  label_y + text_height + padding
346
314
  ], fill='white', outline=color, width=2)
347
315
 
348
- # Draw label text
349
316
  draw.text((label_x, label_y), label_text, fill=color, font=small_font)
350
317
 
351
- # Add page title
352
318
  title_text = f"Page {page.page_index} ({len(page.boxes)} boxes)"
353
319
  if font:
354
320
  title_bbox = draw.textbbox((0, 0), title_text, font=font)
@@ -356,7 +322,6 @@ class StructuredPDFParser:
356
322
  else:
357
323
  title_width = len(title_text) * 12
358
324
 
359
- # Draw title background
360
325
  title_x = (page_width - title_width) // 2
361
326
  title_y = 10
362
327
  draw.rectangle([title_x - 10, title_y - 5, title_x + title_width + 10, title_y + 35],
@@ -365,16 +330,13 @@ class StructuredPDFParser:
365
330
 
366
331
  processed_pages.append(page_img)
367
332
 
368
- # Create grid layout with space for legend
369
333
  legend_width = 250
370
334
  grid_width = cols * page_width + (cols - 1) * spacing
371
335
  total_width = grid_width + legend_width + spacing
372
336
  grid_height = rows * (processed_pages[0].height if processed_pages else 600) + (rows - 1) * spacing
373
337
 
374
- # Create final grid image with modern background
375
338
  final_img = Image.new('RGB', (total_width, grid_height), '#F8FAFC')
376
339
 
377
- # Place pages in grid
378
340
  for idx, page_img in enumerate(processed_pages):
379
341
  row = idx // cols
380
342
  col = idx % cols
@@ -384,13 +346,11 @@ class StructuredPDFParser:
384
346
 
385
347
  final_img.paste(page_img, (x_pos, y_pos))
386
348
 
387
- # Create legend
388
349
  legend_x = grid_width + spacing
389
350
  legend_y = 20
390
351
 
391
352
  draw_legend = ImageDraw.Draw(final_img)
392
353
 
393
- # Legend title
394
354
  legend_title = "Element Types"
395
355
  if font:
396
356
  title_bbox = draw_legend.textbbox((0, 0), legend_title, font=font)
@@ -400,47 +360,38 @@ class StructuredPDFParser:
400
360
  title_width = len(legend_title) * 12
401
361
  title_height = 20
402
362
 
403
- # Draw legend background
404
363
  legend_bg_height = len(used_labels) * 35 + title_height + 40
405
364
  draw_legend.rectangle([legend_x - 10, legend_y - 10,
406
365
  legend_x + legend_width - 10, legend_y + legend_bg_height],
407
366
  fill='white', outline='#E5E7EB', width=2)
408
367
 
409
- # Draw legend title
410
368
  draw_legend.text((legend_x + 10, legend_y + 5), legend_title,
411
369
  fill='#1F2937', font=font)
412
370
 
413
- # Draw legend items - now using dynamic colors for actually detected labels
414
371
  current_y = legend_y + title_height + 20
415
372
 
416
373
  for label in sorted(used_labels):
417
374
  color = dynamic_label_colors[label]
418
375
 
419
- # Draw color square
420
376
  square_size = 20
421
377
  draw_legend.rectangle([legend_x + 10, current_y,
422
378
  legend_x + 10 + square_size, current_y + square_size],
423
379
  fill=color, outline='#6B7280', width=1)
424
380
 
425
- # Draw label text
426
381
  draw_legend.text((legend_x + 40, current_y + 2), label.title(),
427
382
  fill='#374151', font=small_font)
428
383
 
429
384
  current_y += 30
430
385
 
431
- # Save or display
432
386
  if save_path:
433
387
  final_img.save(save_path, quality=95, optimize=True)
434
388
  print(f"Layout visualization saved to: {save_path}")
435
389
  else:
436
- # Display using PIL's default viewer
437
390
  final_img.show()
438
391
 
439
- # Print summary statistics
440
392
  print(f"\n📊 Layout Detection Summary for {os.path.basename(pdf_path)}:")
441
393
  print(f"Pages processed: {pages_to_show}")
442
394
 
443
- # Create summary by label across all pages
444
395
  total_counts = {}
445
396
  for idx in range(pages_to_show):
446
397
  page = pages[idx]
@@ -61,22 +61,17 @@ class ChartTablePDFParser:
61
61
  ):
62
62
  """
63
63
  Initialize the ChartTablePDFParser with extraction configuration.
64
-
65
- Sets up the layout detection engine and optionally the VLM service
66
- for structured data extraction.
67
64
 
68
- :param extract_charts: Whether to extract charts from the document
69
- :param extract_tables: Whether to extract tables from the document
70
- :param use_vlm: Whether to use VLM for structured data extraction
71
- :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
65
+ :param extract_charts: Whether to extract charts from the document (default: True)
66
+ :param extract_tables: Whether to extract tables from the document (default: True)
67
+ :param use_vlm: Whether to use VLM for structured data extraction (default: False)
68
+ :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
72
69
  :param vlm_model: Model name to use (defaults to provider-specific defaults)
73
- :param vlm_api_key: API key for VLM provider
74
- :param layout_model_name: Layout detection model name
75
- :param dpi: DPI for PDF rendering
76
- :param min_score: Minimum confidence score for layout detection
77
- :raises ValueError: If neither extract_charts nor extract_tables is True
70
+ :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
71
+ :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
72
+ :param dpi: DPI for PDF rendering (default: 200)
73
+ :param min_score: Minimum confidence score for layout detection (default: 0.0)
78
74
  """
79
- # Validation
80
75
  if not extract_charts and not extract_tables:
81
76
  raise ValueError("At least one of extract_charts or extract_tables must be True")
82
77
 
@@ -98,21 +93,15 @@ class ChartTablePDFParser:
98
93
  def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
99
94
  """
100
95
  Parse a PDF document and extract charts and/or tables.
101
-
102
- Processes the PDF through layout detection, extracts the specified
103
- element types, saves cropped images, and optionally converts them
104
- to structured data using VLM.
105
96
 
106
97
  :param pdf_path: Path to the input PDF file
107
98
  :param output_base_dir: Base directory for output files (default: "outputs")
108
99
  :return: None
109
100
  """
110
- # Create output directory structure: outputs/<filename>/structured_parsing/
111
101
  pdf_name = Path(pdf_path).stem
112
102
  out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
113
103
  os.makedirs(out_dir, exist_ok=True)
114
104
 
115
- # Create subdirectories based on what we're extracting
116
105
  charts_dir = None
117
106
  tables_dir = None
118
107
 
@@ -129,24 +118,20 @@ class ChartTablePDFParser:
129
118
  )
130
119
  pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
131
120
 
132
- # Determine which labels to extract
133
121
  target_labels = []
134
122
  if self.extract_charts:
135
123
  target_labels.append("chart")
136
124
  if self.extract_tables:
137
125
  target_labels.append("table")
138
126
 
139
- # Count items for progress bars
140
127
  chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages) if self.extract_charts else 0
141
128
  table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages) if self.extract_tables else 0
142
129
 
143
- # Prepare output content
144
130
  if self.use_vlm:
145
131
  md_lines: List[str] = ["# Extracted Charts and Tables\n"]
146
132
  structured_items: List[Dict[str, Any]] = []
147
133
  vlm_items: List[Dict[str, Any]] = []
148
134
 
149
- # Progress bar descriptions
150
135
  charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
151
136
  tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
152
137
 
@@ -154,11 +139,9 @@ class ChartTablePDFParser:
154
139
  table_counter = 1
155
140
 
156
141
  with ExitStack() as stack:
157
- # Enhanced environment detection
158
142
  is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
159
143
  is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
160
144
 
161
- # Use appropriate progress bars based on environment
162
145
  if is_notebook:
163
146
  charts_bar = stack.enter_context(
164
147
  create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
@@ -174,23 +157,19 @@ class ChartTablePDFParser:
174
157
  page_num = p.page_index
175
158
  page_img: Image.Image = pil_pages[page_num - 1]
176
159
 
177
- # Only process selected item types
178
160
  target_items = [box for box in p.boxes if box.label in target_labels]
179
161
 
180
162
  if target_items and self.use_vlm:
181
163
  md_lines.append(f"\n## Page {page_num}\n")
182
164
 
183
165
  for box in sorted(target_items, key=reading_order_key):
184
- # Handle charts
185
166
  if box.label == "chart" and self.extract_charts:
186
167
  chart_filename = f"chart_{chart_counter:03d}.png"
187
168
  chart_path = os.path.join(charts_dir, chart_filename)
188
169
 
189
- # Save image
190
170
  cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
191
171
  cropped_img.save(chart_path)
192
172
 
193
- # Handle VLM processing if enabled
194
173
  if self.use_vlm and self.vlm:
195
174
  rel_path = os.path.join("charts", chart_filename)
196
175
  wrote_table = False
@@ -227,16 +206,13 @@ class ChartTablePDFParser:
227
206
  if charts_bar:
228
207
  charts_bar.update(1)
229
208
 
230
- # Handle tables
231
209
  elif box.label == "table" and self.extract_tables:
232
210
  table_filename = f"table_{table_counter:03d}.png"
233
211
  table_path = os.path.join(tables_dir, table_filename)
234
212
 
235
- # Save image
236
213
  cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
237
214
  cropped_img.save(table_path)
238
215
 
239
- # Handle VLM processing if enabled
240
216
  if self.use_vlm and self.vlm:
241
217
  rel_path = os.path.join("tables", table_filename)
242
218
  wrote_table = False
@@ -273,19 +249,11 @@ class ChartTablePDFParser:
273
249
  if tables_bar:
274
250
  tables_bar.update(1)
275
251
 
276
- # Write outputs only if VLM is used
277
- md_path = None
278
252
  excel_path = None
279
253
 
280
254
  if self.use_vlm:
281
- # Write markdown file
282
- md_path = os.path.join(out_dir, "charts.md")
283
- with open(md_path, 'w', encoding='utf-8') as f:
284
- f.write('\n'.join(md_lines))
285
255
 
286
- # Write Excel file if we have structured data
287
256
  if structured_items:
288
- # Determine Excel filename based on extraction target
289
257
  if self.extract_charts and self.extract_tables:
290
258
  excel_filename = "parsed_tables_charts.xlsx"
291
259
  elif self.extract_charts:
@@ -299,23 +267,19 @@ class ChartTablePDFParser:
299
267
  excel_path = os.path.join(out_dir, excel_filename)
300
268
  write_structured_excel(excel_path, structured_items)
301
269
 
302
- # Also create HTML version
303
270
  html_filename = excel_filename.replace('.xlsx', '.html')
304
271
  html_path = os.path.join(out_dir, html_filename)
305
272
  write_structured_html(html_path, structured_items)
306
273
 
307
- # Write VLM items mapping for UI linkage
308
274
  if 'vlm_items' in locals() and vlm_items:
309
275
  with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
310
276
  json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
311
277
 
312
- # Print results
313
278
  extraction_types = []
314
279
  if self.extract_charts:
315
280
  extraction_types.append("charts")
316
281
  if self.extract_tables:
317
282
  extraction_types.append("tables")
318
283
 
319
- # Print completion message with output directory
320
284
  print(f"✅ Parsing completed successfully!")
321
285
  print(f"📁 Output directory: {out_dir}")
doctra/ui/app.py CHANGED
@@ -17,13 +17,10 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
17
17
 
18
18
  if out_dir.exists():
19
19
  if is_structured_parsing:
20
- # For structured parsing, show ALL files in the directory
21
20
  for file_path in sorted(out_dir.rglob("*")):
22
21
  if file_path.is_file():
23
22
  file_paths.append(str(file_path))
24
23
  else:
25
- # For full parsing, use the original logic
26
- # Always add main output files (HTML, Markdown, etc.) regardless of allowed_kinds
27
24
  main_files = [
28
25
  "result.html",
29
26
  "result.md",
@@ -36,22 +33,18 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
36
33
  if file_path.exists():
37
34
  file_paths.append(str(file_path))
38
35
 
39
- # Add image files based on allowed_kinds or all images if not specified
40
36
  if allowed_kinds:
41
37
  for kind in allowed_kinds:
42
- # ChartTablePDFParser saves directly to charts/ and tables/ directories
43
38
  p = out_dir / kind
44
39
  if p.exists():
45
- for img in sorted(p.glob("*.png")): # ChartTablePDFParser saves as .png
40
+ for img in sorted(p.glob("*.png")):
46
41
  file_paths.append(str(img))
47
42
 
48
- # Also check images/ subdirectories (for StructuredPDFParser)
49
43
  images_dir = out_dir / "images" / kind
50
44
  if images_dir.exists():
51
- for img in sorted(images_dir.glob("*.jpg")): # StructuredPDFParser saves as .jpg
45
+ for img in sorted(images_dir.glob("*.jpg")):
52
46
  file_paths.append(str(img))
53
47
  else:
54
- # Fallback: look in both direct directories and images/ subdirectories
55
48
  for p in (out_dir / "charts").glob("*.png"):
56
49
  file_paths.append(str(p))
57
50
  for p in (out_dir / "tables").glob("*.png"):
@@ -59,7 +52,6 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
59
52
  for p in (out_dir / "images").rglob("*.jpg"):
60
53
  file_paths.append(str(p))
61
54
 
62
- # Add Excel files based on extraction target (for structured parsing)
63
55
  if allowed_kinds:
64
56
  if "charts" in allowed_kinds and "tables" in allowed_kinds:
65
57
  excel_files = ["parsed_tables_charts.xlsx"]
@@ -77,30 +69,24 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
77
69
 
78
70
  kinds = allowed_kinds if allowed_kinds else ["tables", "charts", "figures"]
79
71
  for sub in kinds:
80
- # Look in both direct directories and images/ subdirectories
81
- # First try direct directories (for ChartTablePDFParser)
82
72
  p = out_dir / sub
83
73
  if p.exists():
84
- for img in sorted(p.glob("*.png")): # ChartTablePDFParser saves as .png
74
+ for img in sorted(p.glob("*.png")):
85
75
  gallery_items.append((str(img), f"{sub}: {img.name}"))
86
76
 
87
- # Also try images/ subdirectories (for StructuredPDFParser)
88
77
  images_dir = out_dir / "images" / sub
89
78
  if images_dir.exists():
90
- for img in sorted(images_dir.glob("*.jpg")): # StructuredPDFParser saves as .jpg
79
+ for img in sorted(images_dir.glob("*.jpg")):
91
80
  gallery_items.append((str(img), f"{sub}: {img.name}"))
92
81
 
93
82
  tmp_zip_dir = Path(tempfile.mkdtemp(prefix="doctra_zip_"))
94
83
 
95
- # Use custom filename if provided, otherwise use default
96
84
  if zip_filename:
97
- # Clean the filename to be safe for file systems
98
85
  safe_filename = re.sub(r'[<>:"/\\|?*]', '_', zip_filename)
99
86
  zip_base = tmp_zip_dir / safe_filename
100
87
  else:
101
88
  zip_base = tmp_zip_dir / "doctra_outputs"
102
89
 
103
- # Create a filtered copy of the output directory excluding temp files
104
90
  filtered_dir = tmp_zip_dir / "filtered_outputs"
105
91
  shutil.copytree(out_dir, filtered_dir, ignore=shutil.ignore_patterns('~$*', '*.tmp', '*.temp'))
106
92
 
@@ -125,13 +111,10 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
125
111
  while i < len(lines):
126
112
  line = lines[i].strip()
127
113
 
128
- # Check for page header
129
114
  if line.startswith('## Page '):
130
- # Save previous page if exists
131
115
  if current_page:
132
116
  pages.append(current_page)
133
117
 
134
- # Start new page
135
118
  page_num = line.replace('## Page ', '').strip()
136
119
  current_page = {
137
120
  'page_num': page_num,
@@ -145,15 +128,12 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
145
128
  i += 1
146
129
  continue
147
130
 
148
- # Check for images (tables, charts, figures)
149
131
  if line.startswith('![') and '](images/' in line:
150
- # Extract image info
151
132
  match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
152
133
  if match:
153
134
  caption = match.group(1)
154
135
  img_path = match.group(2)
155
136
 
156
- # Categorize by type
157
137
  if 'Table' in caption:
158
138
  current_page['tables'].append({'caption': caption, 'path': img_path})
159
139
  elif 'Chart' in caption:
@@ -163,18 +143,15 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
163
143
 
164
144
  current_page['images'].append({'caption': caption, 'path': img_path})
165
145
 
166
- # Add to full content with proper markdown formatting
167
146
  current_page['full_content'].append(f"![{caption}]({img_path})")
168
147
 
169
- # Regular content
170
148
  elif current_page:
171
- if line: # Only add non-empty lines
149
+ if line:
172
150
  current_page['content'].append(line)
173
151
  current_page['full_content'].append(line)
174
152
 
175
153
  i += 1
176
154
 
177
- # Add the last page
178
155
  if current_page:
179
156
  pages.append(current_page)
180
157
 
@@ -198,12 +175,9 @@ def run_full_parse(
198
175
  if not pdf_file:
199
176
  return ("No file provided.", None, [], [], "")
200
177
 
201
- # Extract filename from the uploaded file path
202
- # Gradio provides the original filename in the file path
203
178
  original_filename = Path(pdf_file).stem
204
179
 
205
180
  tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
206
- # Use original filename for temp file so parser creates correct output directory
207
181
  input_pdf = tmp_dir / f"{original_filename}.pdf"
208
182
  shutil.copy2(pdf_file, input_pdf)
209
183
 
@@ -295,7 +269,6 @@ def run_extract(
295
269
  original_filename = Path(pdf_file).stem
296
270
 
297
271
  tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
298
- # Use original filename for temp file so parser creates correct output directory
299
272
  input_pdf = tmp_dir / f"{original_filename}.pdf"
300
273
  shutil.copy2(pdf_file, input_pdf)
301
274
 
doctra/utils/progress.py CHANGED
@@ -40,7 +40,6 @@ def _detect_environment() -> Tuple[bool, bool, bool]:
40
40
  Returns (is_notebook, is_tty, is_windows).
41
41
  """
42
42
  is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
43
- # Colab/Kaggle specifics
44
43
  if "google.colab" in sys.modules:
45
44
  is_notebook = True
46
45
  if "kaggle_secrets" in sys.modules or "kaggle_web_client" in sys.modules:
@@ -59,7 +58,6 @@ def _select_emoji(key: str) -> str:
59
58
  - ascii: ASCII text tokens
60
59
  - none: empty prefix
61
60
  """
62
- # Maps
63
61
  default_map = {
64
62
  "loading": "🔄",
65
63
  "charts": "📊",
@@ -70,14 +68,13 @@ def _select_emoji(key: str) -> str:
70
68
  "processing": "⚙️",
71
69
  }
72
70
  safe_map = {
73
- # Use BMP or geometric shapes likely to render everywhere
74
71
  "loading": "⏳",
75
72
  "charts": "▦",
76
73
  "tables": "▤",
77
74
  "figures": "▧",
78
75
  "ocr": "🔎",
79
76
  "vlm": "★",
80
- "processing": "⚙", # no variation selector
77
+ "processing": "⚙",
81
78
  }
82
79
  ascii_map = {
83
80
  "loading": "[loading]",
@@ -89,13 +86,11 @@ def _select_emoji(key: str) -> str:
89
86
  "processing": "[processing]",
90
87
  }
91
88
 
92
- # Determine effective mode
93
89
  mode = _PROGRESS_CONFIG.emoji_mode
94
90
  is_notebook, _, is_windows = _detect_environment()
95
91
  if not _PROGRESS_CONFIG.use_emoji:
96
92
  mode = "none"
97
93
  elif mode == "default":
98
- # Heuristics: prefer safe in Colab/Kaggle notebooks and Windows terminals
99
94
  if is_windows or "google.colab" in sys.modules or "kaggle_secrets" in sys.modules:
100
95
  mode = "safe"
101
96
 
@@ -105,7 +100,6 @@ def _select_emoji(key: str) -> str:
105
100
  return ascii_map.get(key, "")
106
101
  if mode == "safe":
107
102
  return safe_map.get(key, safe_map["processing"])
108
- # default
109
103
  return default_map.get(key, default_map["processing"])
110
104
 
111
105
 
@@ -119,17 +113,13 @@ def _supports_unicode_output() -> bool:
119
113
  except Exception:
120
114
  pass
121
115
 
122
- # Heuristics for common notebook environments that support emoji
123
116
  env = os.environ
124
117
  if any(k in env for k in ("COLAB_GPU", "GCE_METADATA_HOST", "KAGGLE_KERNEL_RUN_TYPE", "JPY_PARENT_PID")):
125
118
  return True
126
119
 
127
- # On modern Windows terminals with UTF-8 code page, assume yes
128
120
  if sys.platform.startswith("win"):
129
- # If user opted-in to force ASCII, respect it
130
121
  if _PROGRESS_CONFIG.force_ascii:
131
122
  return False
132
- # Try to detect WT/Terminal/VSCode which usually handle Unicode
133
123
  if any(k in env for k in ("WT_SESSION", "TERM_PROGRAM", "VSCODE_PID")):
134
124
  return True
135
125
 
@@ -161,19 +151,15 @@ def create_beautiful_progress_bar(
161
151
  :return: Configured tqdm progress bar instance
162
152
  """
163
153
 
164
- # Enhanced styling parameters - notebook-friendly format
165
154
  is_notebook, is_tty, is_windows = _detect_environment()
166
155
  if is_notebook:
167
- # Simpler format for notebooks to avoid display issues
168
156
  bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
169
157
  else:
170
- # Full format for terminal
171
158
  bar_format = (
172
159
  "{l_bar}{bar:30}| {n_fmt}/{total_fmt} "
173
160
  "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
174
161
  )
175
162
 
176
- # Color schemes based on operation type
177
163
  color_schemes = {
178
164
  "loading": {"colour": "cyan", "ncols": 100},
179
165
  "charts": {"colour": "green", "ncols": 100},
@@ -184,7 +170,6 @@ def create_beautiful_progress_bar(
184
170
  "processing": {"colour": "white", "ncols": 100},
185
171
  }
186
172
 
187
- # Determine color scheme based on description
188
173
  desc_lower = desc.lower()
189
174
  if "loading" in desc_lower or "model" in desc_lower:
190
175
  color_scheme = color_schemes["loading"]
@@ -201,45 +186,37 @@ def create_beautiful_progress_bar(
201
186
  else:
202
187
  color_scheme = color_schemes["processing"]
203
188
 
204
- # Emoji categories
205
189
  emoji_categories = {"loading", "charts", "tables", "figures", "ocr", "vlm", "processing"}
206
190
 
207
- # Add appropriate emoji to description (can be disabled)
208
191
  if _PROGRESS_CONFIG.use_emoji:
209
192
  prefix_key = next((k for k in emoji_categories if k in desc_lower), "processing")
210
193
  prefix = _select_emoji(prefix_key)
211
194
  if prefix:
212
195
  desc = f"{prefix} {desc}"
213
196
 
214
- # Enhanced tqdm configuration
215
197
  tqdm_config = {
216
198
  "total": total,
217
199
  "desc": desc,
218
200
  "leave": leave,
219
201
  "bar_format": bar_format,
220
202
  "ncols": _PROGRESS_CONFIG.ncols_env or color_scheme["ncols"],
221
- # Prefer Unicode unless user forces ASCII or environment lacks Unicode support
222
203
  "ascii": _PROGRESS_CONFIG.force_ascii or not _supports_unicode_output(),
223
- "dynamic_ncols": True, # Responsive width
224
- "smoothing": 0.3, # Smooth progress updates
225
- "mininterval": 0.1, # Minimum update interval
226
- "maxinterval": 1.0, # Maximum update interval
204
+ "dynamic_ncols": True,
205
+ "smoothing": 0.3,
206
+ "mininterval": 0.1,
207
+ "maxinterval": 1.0,
227
208
  "position": position,
228
209
  **kwargs
229
210
  }
230
211
 
231
- # Enhanced environment detection
232
212
  is_notebook, is_terminal, is_windows = _detect_environment()
233
213
 
234
- # Add color only for terminal environments (not notebooks)
235
214
  if not is_notebook and is_terminal:
236
215
  tqdm_config["colour"] = color_scheme["colour"]
237
216
 
238
- # Respect global disable
239
217
  if _PROGRESS_CONFIG.disable:
240
218
  tqdm_config["disable"] = True
241
219
 
242
- # Try creating the progress bar with Unicode, fallback to ASCII on failure (e.g., Windows code page)
243
220
  if is_notebook:
244
221
  tqdm_config.pop("colour", None)
245
222
  try:
@@ -297,7 +274,6 @@ def update_progress_with_info(
297
274
  :param info: Optional dictionary of information to display
298
275
  """
299
276
  if info:
300
- # Format info as postfix
301
277
  postfix_parts = []
302
278
  for key, value in info.items():
303
279
  if isinstance(value, float):
@@ -354,54 +330,22 @@ def create_notebook_friendly_bar(
354
330
  **kwargs
355
331
  ) -> tqdm:
356
332
  """
357
- Create a notebook-friendly progress bar with minimal formatting.
333
+ Create a notebook-friendly progress bar with consistent sizing.
358
334
 
359
- This function creates progress bars specifically optimized for Jupyter notebooks
360
- to avoid display issues and ANSI code problems.
335
+ This function creates progress bars that match the main progress bar
336
+ styling and behavior in notebook environments.
361
337
 
362
338
  :param total: Total number of items to process
363
339
  :param desc: Description text for the progress bar
364
340
  :param kwargs: Additional tqdm parameters
365
341
  :return: Configured notebook-friendly progress bar
366
342
  """
367
- # Force notebook mode
368
- if _PROGRESS_CONFIG.disable:
369
- kwargs["disable"] = True
370
- else:
371
- kwargs["disable"] = False
372
- # Prefer Unicode in notebooks if supported
373
- if "ascii" not in kwargs:
374
- kwargs["ascii"] = _PROGRESS_CONFIG.force_ascii or not _supports_unicode_output()
375
-
376
- # Emoji categories
377
- emoji_categories = {"loading", "charts", "tables", "figures", "ocr", "vlm", "processing"}
378
-
379
- # Add appropriate emoji to description
380
- desc_lower = desc.lower()
381
- if _PROGRESS_CONFIG.use_emoji:
382
- prefix_key = next((k for k in emoji_categories if k in desc_lower), "processing")
383
- prefix = _select_emoji(prefix_key)
384
- if prefix:
385
- desc = f"{prefix} {desc}"
386
-
387
- # Simple format for notebooks
388
- bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt}"
389
-
390
- tqdm_config = {
391
- "total": total,
392
- "desc": desc,
393
- "leave": True,
394
- "bar_format": bar_format,
395
- "ncols": _PROGRESS_CONFIG.ncols_env or 80,
396
- "ascii": kwargs.get("ascii", False),
397
- "dynamic_ncols": False, # Fixed width for notebooks
398
- "smoothing": 0.1, # Faster updates
399
- "mininterval": 0.05,
400
- "maxinterval": 0.5,
343
+ return create_beautiful_progress_bar(
344
+ total=total,
345
+ desc=desc,
346
+ leave=True,
401
347
  **kwargs
402
- }
403
-
404
- return tqdm_auto(**tqdm_config)
348
+ )
405
349
 
406
350
 
407
351
  def progress_for(iterable: Iterable[Any], desc: str, total: Optional[int] = None, leave: bool = True, **kwargs) -> Iterator[Any]:
@@ -1,49 +1,45 @@
1
- from __future__ import annotations
2
- from typing import Any, Dict, Optional
3
- import json
4
-
5
- try:
6
- from pydantic import BaseModel # type: ignore
7
- except Exception: # pydantic not strictly required for normalization
8
- class BaseModel: # fallback stub
9
- pass
10
-
11
- def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
12
- """
13
- Accepts a VLM result that might be:
14
- - JSON string
15
- - dict
16
- - Pydantic BaseModel (v1 .dict() or v2 .model_dump())
17
- Returns a normalized dict with keys: title, headers, rows — or None.
18
- """
19
- if obj is None:
20
- return None
21
-
22
- # JSON string from VLM
23
- if isinstance(obj, str):
24
- try:
25
- obj = json.loads(obj)
26
- except Exception:
27
- return None
28
-
29
- # Pydantic model
30
- if isinstance(obj, BaseModel):
31
- try:
32
- return obj.model_dump() # pydantic v2
33
- except Exception:
34
- try:
35
- return obj.dict() # pydantic v1
36
- except Exception:
37
- return None
38
-
39
- # Plain dict
40
- if isinstance(obj, dict):
41
- title = obj.get("title") or "Untitled"
42
- headers = obj.get("headers") or []
43
- rows = obj.get("rows") or []
44
- # Basic shape checks
45
- if not isinstance(headers, list) or not isinstance(rows, list):
46
- return None
47
- return {"title": title, "headers": headers, "rows": rows}
48
-
49
- return None
1
+ from __future__ import annotations
2
+ from typing import Any, Dict, Optional
3
+ import json
4
+
5
+ try:
6
+ from pydantic import BaseModel # type: ignore
7
+ except Exception:
8
+ class BaseModel:
9
+ pass
10
+
11
+ def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
12
+ """
13
+ Accepts a VLM result that might be:
14
+ - JSON string
15
+ - dict
16
+ - Pydantic BaseModel (v1 .dict() or v2 .model_dump())
17
+ Returns a normalized dict with keys: title, headers, rows — or None.
18
+ """
19
+ if obj is None:
20
+ return None
21
+
22
+ if isinstance(obj, str):
23
+ try:
24
+ obj = json.loads(obj)
25
+ except Exception:
26
+ return None
27
+
28
+ if isinstance(obj, BaseModel):
29
+ try:
30
+ return obj.model_dump()
31
+ except Exception:
32
+ try:
33
+ return obj.dict()
34
+ except Exception:
35
+ return None
36
+
37
+ if isinstance(obj, dict):
38
+ title = obj.get("title") or "Untitled"
39
+ headers = obj.get("headers") or []
40
+ rows = obj.get("rows") or []
41
+ if not isinstance(headers, list) or not isinstance(rows, list):
42
+ return None
43
+ return {"title": title, "headers": headers, "rows": rows}
44
+
45
+ return None
doctra/version.py CHANGED
@@ -1,2 +1,2 @@
1
1
  """Version information for Doctra."""
2
- __version__ = '0.3.1'
2
+ __version__ = '0.3.3'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doctra
3
- Version: 0.3.1
3
+ Version: 0.3.3
4
4
  Summary: Parse, extract, and analyze documents with ease
5
5
  Home-page: https://github.com/AdemBoukhris457/Doctra
6
6
  Author: Adem Boukhris
@@ -1,5 +1,5 @@
1
1
  doctra/__init__.py,sha256=ST_c2GWBoB0y_wpL1qsOeK4bR1RyJhMMn6I5VjVRI6Y,613
2
- doctra/version.py,sha256=BDWZqR8pRPnlsqLDR4Kx91MC6A9OwylJHhHemdaa6DQ,60
2
+ doctra/version.py,sha256=-8CkxAWlU-OCRJP3Yq9OGjh-4nS4-sU-LRjZ28K6oUw,62
3
3
  doctra/cli/__init__.py,sha256=4PTujjYRShOOUlZ7PwuWckShPWLC4v4CYIhJpzgyv1k,911
4
4
  doctra/cli/main.py,sha256=o_W1b5kx3xaTbWK6l4IYi0YLwffKBj5pQKflnlaG2Fw,35611
5
5
  doctra/cli/utils.py,sha256=IghiUZQCOmXODC5-5smHGz2KeV4xqbP4avmA1Mggln0,11800
@@ -14,7 +14,7 @@ doctra/engines/ocr/pytesseract_engine.py,sha256=Imz2uwju6himkBiS8CH7DLxBRe-LtmMY
14
14
  doctra/engines/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  doctra/engines/vlm/outlines_types.py,sha256=qL-G6MNiA5mxp1qAPVEFhOANp4NqVt_MQKseJCr_xXE,970
16
16
  doctra/engines/vlm/provider.py,sha256=aE8Eo1U-8XqAimakNlT0-T4etIyCV8rZ3DwxdqbFeTc,3131
17
- doctra/engines/vlm/service.py,sha256=Jwws2Jw68-IdHyvEWks4UCoP7Olhqt8IpXfCv5Z7Ml4,4724
17
+ doctra/engines/vlm/service.py,sha256=4ExDbLmyyC3ICXxr7OSIqvbOdrwbIJek-DE54vAUgDA,4151
18
18
  doctra/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  doctra/exporters/excel_writer.py,sha256=U5Eb5SF7_ll1QveUapSWSkCRt3OEoisKEVUQ_7X8Wjo,7762
20
20
  doctra/exporters/html_writer.py,sha256=OlW24Eg5bZcjldRHtd3GDD7RrajuRXj43EJpXIJkYf8,38810
@@ -23,10 +23,10 @@ doctra/exporters/markdown_table.py,sha256=4_OJIwG_WoIPYBzJx1njy_3tNVdkK6QKSP-P9r
23
23
  doctra/exporters/markdown_writer.py,sha256=L7EjF2MB8jYX7XkZ3a3NeeEC8gnb0qzRPTzIN9tdfuw,1027
24
24
  doctra/parsers/__init__.py,sha256=8M6LVzcWGpuTIK_1SMXML3ll7zK1CTHXGI5qXvqdm-A,206
25
25
  doctra/parsers/layout_order.py,sha256=W6b-T11H907RZ2FaZwNvnYhmvH11rpUzxC5yLkdf28k,640
26
- doctra/parsers/structured_pdf_parser.py,sha256=fbDIQ6VFv1phFPC3lKgcjtCp0AdNA8Ny1dK0F726Pww,21357
27
- doctra/parsers/table_chart_extractor.py,sha256=JuoScqCQbPdQjy4ak77OcZHSPYKGHF4H39fEW6gF3eo,15323
26
+ doctra/parsers/structured_pdf_parser.py,sha256=QIZIS5SAaIdGiT8o7G_a4D-Cht7nVLGeSuVzqSYLn14,19160
27
+ doctra/parsers/table_chart_extractor.py,sha256=kSubqX0n0kVu_3jzX6QUyKmEGs9sG3Bg9kzUzn2wPHo,13733
28
28
  doctra/ui/__init__.py,sha256=XzOOKeGSBnUREuDQiCIWds1asFSa2nypFQTJXwclROA,85
29
- doctra/ui/app.py,sha256=FYDlEG_2pfp7SSHnA04NRNUhOcI-BJPh3qAf5dw5D6g,45903
29
+ doctra/ui/app.py,sha256=WpXUWHSs7wSYNjY4iBOZJHsKGQ88jDytvOFIjuhqAGE,44031
30
30
  doctra/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
31
  doctra/utils/bbox.py,sha256=R2-95p0KiWvet3TH27TQVvCar7WJg6z0u3L21iEDF-A,674
32
32
  doctra/utils/constants.py,sha256=ZWOvNDrvETbQ_pxHiX7vUW4J5Oj8_qnov0QacUOBizI,189
@@ -34,11 +34,11 @@ doctra/utils/file_ops.py,sha256=3IS0EQncs6Kaj27fcg2zxQX3xRSvtItIsyKGLYgeOgw,815
34
34
  doctra/utils/io_utils.py,sha256=L1bWV4-ybs2j_3ZEN7GfQVgdC73JKVECVnpwKbP0dy0,219
35
35
  doctra/utils/ocr_utils.py,sha256=Doa1uYBg3kRgRYd2aPq9fICHgHfrM_efdhZfI7jl6OM,780
36
36
  doctra/utils/pdf_io.py,sha256=c8EY47Z1iqVtlLFHS_n0qGuXJ5ERFaMUd84ivXV0b9E,706
37
- doctra/utils/progress.py,sha256=sNEjTdN32J1-eXFPqwZRw2EZQ1SXSesXBd5StJvtlmc,14481
37
+ doctra/utils/progress.py,sha256=IKQ_YErWSEd4hddYMUiCORy0_kW4TOYJM891HUEq2_E,11901
38
38
  doctra/utils/quiet.py,sha256=5XPS-1CtJ0sVk6qgSQctdhr_wR8mP1xoJLoUbmkXROA,387
39
- doctra/utils/structured_utils.py,sha256=J-qTqo8eCjm36FaRJ_I482LFgYCpm3eukZm-gbNnchw,1401
40
- doctra-0.3.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
41
- doctra-0.3.1.dist-info/METADATA,sha256=2-2aMiNRvofe2WYuYejI6NqSkVctiH5SLK-EX4nIjaE,28298
42
- doctra-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
43
- doctra-0.3.1.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
44
- doctra-0.3.1.dist-info/RECORD,,
39
+ doctra/utils/structured_utils.py,sha256=znC2zr80rZMfIV58lipZ8M4zPq6IF070pdwLBve1qiE,1251
40
+ doctra-0.3.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
41
+ doctra-0.3.3.dist-info/METADATA,sha256=GX4AvDkmBPFcmt0drF84Wy2WuiqB0ivNw_7bMEpHuMc,28298
42
+ doctra-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
43
+ doctra-0.3.3.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
44
+ doctra-0.3.3.dist-info/RECORD,,
File without changes