orbitkit 0.8.62__tar.gz → 0.8.63__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. {orbitkit-0.8.62/orbitkit.egg-info → orbitkit-0.8.63}/PKG-INFO +1 -1
  2. orbitkit-0.8.63/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +18 -14
  4. {orbitkit-0.8.62 → orbitkit-0.8.63/orbitkit.egg-info}/PKG-INFO +1 -1
  5. orbitkit-0.8.62/orbitkit/VERSION +0 -1
  6. {orbitkit-0.8.62 → orbitkit-0.8.63}/LICENSE +0 -0
  7. {orbitkit-0.8.62 → orbitkit-0.8.63}/MANIFEST.in +0 -0
  8. {orbitkit-0.8.62 → orbitkit-0.8.63}/README.md +0 -0
  9. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/__init__.py +0 -0
  10. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/__init__.py +0 -0
  11. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  12. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  13. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  14. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  16. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/audio_transcoder/__init__.py +0 -0
  17. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  18. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/constant/__init__.py +0 -0
  19. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/constant/report_schema.py +0 -0
  20. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/id_srv/__init__.py +0 -0
  21. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/id_srv/id_gen.py +0 -0
  22. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/id_srv/id_perm_like.py +0 -0
  23. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/lark_send/__init__.py +0 -0
  24. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/lark_send/lark.py +0 -0
  25. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/llm_tools/__init__.py +0 -0
  26. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  27. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/orbit_type/__init__.py +0 -0
  28. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  29. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  30. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/orbit_type/tools.py +0 -0
  31. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_embedding/__init__.py +0 -0
  32. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  33. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  34. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/__init__.py +0 -0
  35. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  36. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/exceptions.py +0 -0
  37. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/mineru_demo.py +0 -0
  38. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  39. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  40. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  41. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  42. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  43. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  44. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  45. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  46. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  47. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/base.py +0 -0
  48. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  49. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/core.py +0 -0
  50. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  51. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  52. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  53. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_writer/__init__.py +0 -0
  54. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  55. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/__init__.py +0 -0
  56. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/cache_asset_downloader.py +0 -0
  57. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/common.py +0 -0
  58. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/customize_regix_manager.py +0 -0
  59. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/secret_manager.py +0 -0
  60. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/universal_extractor.py +0 -0
  61. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_aliyun.py +0 -0
  62. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  63. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_aws.py +0 -0
  64. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  65. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_date.py +0 -0
  66. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_html.py +0 -0
  67. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_kafka.py +0 -0
  68. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_md5.py +0 -0
  69. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_selenium.py +0 -0
  70. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_simple_timer.py +0 -0
  71. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_str.py +0 -0
  72. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_type_mapping.py +0 -0
  73. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_url.py +0 -0
  74. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit.egg-info/SOURCES.txt +0 -0
  75. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit.egg-info/dependency_links.txt +0 -0
  76. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit.egg-info/not-zip-safe +0 -0
  77. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit.egg-info/requires.txt +0 -0
  78. {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit.egg-info/top_level.txt +0 -0
  79. {orbitkit-0.8.62 → orbitkit-0.8.63}/setup.cfg +0 -0
  80. {orbitkit-0.8.62 → orbitkit-0.8.63}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.62
3
+ Version: 0.8.63
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.63
@@ -429,29 +429,34 @@ class MinerUExtract:
429
429
  return self.level_two_txt_com(block_seq, pages_block, pages_body, 'sentence')
430
430
 
431
431
  def level_two_table(self, block_seq, pages_block, pages_body):
432
- """处理 table 类型 block,sentence 直接存储 HTML 格式,脚注等文本单独生成 block"""
432
+ """处理 table 类型 block,sentence 直接存储 HTML 格式,脚注等文本单独生成 block(各自保留 bbox)"""
433
433
  table_blocks = pages_block.get('blocks', [])
434
434
  table_html = ""
435
- text_contents = [] # 收集 table_footnote 等文本内容
435
+ table_bbox = pages_block.get('bbox', [0, 0, 0, 0])
436
436
  _image_detail_arr = []
437
- bbox = pages_block.get('bbox', [0, 0, 0, 0])
437
+ # 收集非 table 类型的子 block(如 table_footnote, table_caption 等),保留各自的 bbox
438
+ text_blocks = []
438
439
 
439
- for table in table_blocks:
440
- pages_lines = table.get("lines", [])
440
+ for sub_block in table_blocks:
441
+ sub_bbox = sub_block.get('bbox', table_bbox)
442
+ pages_lines = sub_block.get("lines", [])
441
443
  for _line in pages_lines:
442
444
  for _span in _line.get('spans', []):
443
445
  span_type = _span.get('type', '')
444
446
  if span_type == 'table' and 'html' in _span:
445
447
  table_html = _span['html']
446
- elif 'content' in _span: # table_footnote 等文本类型
447
- text_contents.append(_span['content'])
448
+ elif 'content' in _span: # table_footnote, table_caption 等文本类型
449
+ text_blocks.append({
450
+ 'content': _span['content'],
451
+ 'bbox': sub_bbox
452
+ })
448
453
  if _span.get('image_path'):
449
454
  _image_detail_arr.append({"path": f'images/{_span["image_path"]}', "desc": ""})
450
455
 
451
456
  # 表格 HTML 单独一个 block
452
457
  if table_html or _image_detail_arr:
453
458
  block_seq = block_seq + 1
454
- text_location = self.get_location(bbox)
459
+ text_location = self.get_location(table_bbox)
455
460
  self.blocks.append({
456
461
  "id": id_srv.get_random_short_id(),
457
462
  "page": self.page,
@@ -463,21 +468,20 @@ class MinerUExtract:
463
468
  })
464
469
  pages_body.append(table_html)
465
470
 
466
- # 脚注等文本单独一个 block
467
- if text_contents:
471
+ # 每个脚注/标题等文本单独一个 block,使用各自的 bbox
472
+ for text_block in text_blocks:
468
473
  block_seq = block_seq + 1
469
- text_location = self.get_location(bbox)
470
- _text_str = '\n'.join(text_contents)
474
+ text_location = self.get_location(text_block['bbox'])
471
475
  self.blocks.append({
472
476
  "id": id_srv.get_random_short_id(),
473
477
  "page": self.page,
474
478
  "seq_no": block_seq,
475
- "sentence": _text_str,
479
+ "sentence": text_block['content'],
476
480
  "type": 'sentence',
477
481
  "image_detail": [],
478
482
  "text_location": text_location
479
483
  })
480
- pages_body.append(_text_str)
484
+ pages_body.append(text_block['content'])
481
485
 
482
486
  return block_seq, pages_body
483
487
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.62
3
+ Version: 0.8.63
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.62
File without changes
File without changes
File without changes
File without changes
File without changes