orbitkit 0.8.61__tar.gz → 0.8.63__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (80)
  1. {orbitkit-0.8.61/orbitkit.egg-info → orbitkit-0.8.63}/PKG-INFO +1 -1
  2. orbitkit-0.8.63/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +33 -9
  4. {orbitkit-0.8.61 → orbitkit-0.8.63/orbitkit.egg-info}/PKG-INFO +1 -1
  5. orbitkit-0.8.61/orbitkit/VERSION +0 -1
  6. {orbitkit-0.8.61 → orbitkit-0.8.63}/LICENSE +0 -0
  7. {orbitkit-0.8.61 → orbitkit-0.8.63}/MANIFEST.in +0 -0
  8. {orbitkit-0.8.61 → orbitkit-0.8.63}/README.md +0 -0
  9. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/__init__.py +0 -0
  10. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/__init__.py +0 -0
  11. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  12. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  13. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  14. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  16. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/audio_transcoder/__init__.py +0 -0
  17. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  18. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/constant/__init__.py +0 -0
  19. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/constant/report_schema.py +0 -0
  20. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/id_srv/__init__.py +0 -0
  21. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/id_srv/id_gen.py +0 -0
  22. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/id_srv/id_perm_like.py +0 -0
  23. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/lark_send/__init__.py +0 -0
  24. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/lark_send/lark.py +0 -0
  25. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/llm_tools/__init__.py +0 -0
  26. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  27. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/orbit_type/__init__.py +0 -0
  28. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  29. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  30. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/orbit_type/tools.py +0 -0
  31. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_embedding/__init__.py +0 -0
  32. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  33. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  34. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/__init__.py +0 -0
  35. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  36. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/exceptions.py +0 -0
  37. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/mineru_demo.py +0 -0
  38. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  39. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  40. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  41. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  42. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  43. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  44. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  45. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  46. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  47. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/base.py +0 -0
  48. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  49. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/core.py +0 -0
  50. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  51. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  52. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  53. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_writer/__init__.py +0 -0
  54. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  55. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/__init__.py +0 -0
  56. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/cache_asset_downloader.py +0 -0
  57. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/common.py +0 -0
  58. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/customize_regix_manager.py +0 -0
  59. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/secret_manager.py +0 -0
  60. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/universal_extractor.py +0 -0
  61. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_aliyun.py +0 -0
  62. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  63. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_aws.py +0 -0
  64. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  65. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_date.py +0 -0
  66. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_html.py +0 -0
  67. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_kafka.py +0 -0
  68. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_md5.py +0 -0
  69. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_selenium.py +0 -0
  70. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_simple_timer.py +0 -0
  71. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_str.py +0 -0
  72. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_type_mapping.py +0 -0
  73. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_url.py +0 -0
  74. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit.egg-info/SOURCES.txt +0 -0
  75. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit.egg-info/dependency_links.txt +0 -0
  76. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit.egg-info/not-zip-safe +0 -0
  77. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit.egg-info/requires.txt +0 -0
  78. {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit.egg-info/top_level.txt +0 -0
  79. {orbitkit-0.8.61 → orbitkit-0.8.63}/setup.cfg +0 -0
  80. {orbitkit-0.8.61 → orbitkit-0.8.63}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.61
3
+ Version: 0.8.63
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.63
@@ -429,36 +429,60 @@ class MinerUExtract:
429
429
  return self.level_two_txt_com(block_seq, pages_block, pages_body, 'sentence')
430
430
 
431
431
  def level_two_table(self, block_seq, pages_block, pages_body):
432
- """处理 table 类型 block,sentence 直接存储 HTML 格式"""
432
+ """处理 table 类型 block,sentence 直接存储 HTML 格式,脚注等文本单独生成 block(各自保留 bbox)"""
433
433
  table_blocks = pages_block.get('blocks', [])
434
434
  table_html = ""
435
+ table_bbox = pages_block.get('bbox', [0, 0, 0, 0])
435
436
  _image_detail_arr = []
436
- bbox = pages_block.get('bbox', [0, 0, 0, 0])
437
+ # 收集非 table 类型的子 block(如 table_footnote, table_caption 等),保留各自的 bbox
438
+ text_blocks = []
437
439
 
438
- for table in table_blocks:
439
- pages_lines = table.get("lines", [])
440
+ for sub_block in table_blocks:
441
+ sub_bbox = sub_block.get('bbox', table_bbox)
442
+ pages_lines = sub_block.get("lines", [])
440
443
  for _line in pages_lines:
441
444
  for _span in _line.get('spans', []):
442
445
  span_type = _span.get('type', '')
443
446
  if span_type == 'table' and 'html' in _span:
444
447
  table_html = _span['html']
448
+ elif 'content' in _span: # table_footnote, table_caption 等文本类型
449
+ text_blocks.append({
450
+ 'content': _span['content'],
451
+ 'bbox': sub_bbox
452
+ })
445
453
  if _span.get('image_path'):
446
454
  _image_detail_arr.append({"path": f'images/{_span["image_path"]}', "desc": ""})
447
455
 
448
- _block_str = table_html
449
- if _block_str or _image_detail_arr:
456
+ # 表格 HTML 单独一个 block
457
+ if table_html or _image_detail_arr:
450
458
  block_seq = block_seq + 1
451
- text_location = self.get_location(bbox)
459
+ text_location = self.get_location(table_bbox)
452
460
  self.blocks.append({
453
461
  "id": id_srv.get_random_short_id(),
454
462
  "page": self.page,
455
463
  "seq_no": block_seq,
456
- "sentence": _block_str,
464
+ "sentence": table_html,
457
465
  "type": 'table',
458
466
  "image_detail": _image_detail_arr,
459
467
  "text_location": text_location
460
468
  })
461
- pages_body.append(_block_str)
469
+ pages_body.append(table_html)
470
+
471
+ # 每个脚注/标题等文本单独一个 block,使用各自的 bbox
472
+ for text_block in text_blocks:
473
+ block_seq = block_seq + 1
474
+ text_location = self.get_location(text_block['bbox'])
475
+ self.blocks.append({
476
+ "id": id_srv.get_random_short_id(),
477
+ "page": self.page,
478
+ "seq_no": block_seq,
479
+ "sentence": text_block['content'],
480
+ "type": 'sentence',
481
+ "image_detail": [],
482
+ "text_location": text_location
483
+ })
484
+ pages_body.append(text_block['content'])
485
+
462
486
  return block_seq, pages_body
463
487
 
464
488
  def level_two_image(self, block_seq, pages_block, pages_body):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.61
3
+ Version: 0.8.63
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.61
File without changes
File without changes
File without changes
File without changes
File without changes