orbitkit 0.8.61__tar.gz → 0.8.62__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. {orbitkit-0.8.61/orbitkit.egg-info → orbitkit-0.8.62}/PKG-INFO +1 -1
  2. orbitkit-0.8.62/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +25 -5
  4. {orbitkit-0.8.61 → orbitkit-0.8.62/orbitkit.egg-info}/PKG-INFO +1 -1
  5. orbitkit-0.8.61/orbitkit/VERSION +0 -1
  6. {orbitkit-0.8.61 → orbitkit-0.8.62}/LICENSE +0 -0
  7. {orbitkit-0.8.61 → orbitkit-0.8.62}/MANIFEST.in +0 -0
  8. {orbitkit-0.8.61 → orbitkit-0.8.62}/README.md +0 -0
  9. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/__init__.py +0 -0
  10. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/__init__.py +0 -0
  11. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  12. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  13. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  14. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  16. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/audio_transcoder/__init__.py +0 -0
  17. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  18. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/constant/__init__.py +0 -0
  19. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/constant/report_schema.py +0 -0
  20. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/id_srv/__init__.py +0 -0
  21. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/id_srv/id_gen.py +0 -0
  22. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/id_srv/id_perm_like.py +0 -0
  23. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/lark_send/__init__.py +0 -0
  24. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/lark_send/lark.py +0 -0
  25. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/llm_tools/__init__.py +0 -0
  26. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  27. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/orbit_type/__init__.py +0 -0
  28. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  29. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  30. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/orbit_type/tools.py +0 -0
  31. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_embedding/__init__.py +0 -0
  32. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  33. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  34. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/__init__.py +0 -0
  35. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  36. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/exceptions.py +0 -0
  37. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/mineru_demo.py +0 -0
  38. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  39. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  40. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  41. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  42. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  43. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  44. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  45. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  46. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  47. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/base.py +0 -0
  48. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  49. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/core.py +0 -0
  50. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  51. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  52. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  53. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_writer/__init__.py +0 -0
  54. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  55. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/__init__.py +0 -0
  56. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/cache_asset_downloader.py +0 -0
  57. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/common.py +0 -0
  58. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/customize_regix_manager.py +0 -0
  59. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/secret_manager.py +0 -0
  60. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/universal_extractor.py +0 -0
  61. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_aliyun.py +0 -0
  62. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  63. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_aws.py +0 -0
  64. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  65. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_date.py +0 -0
  66. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_html.py +0 -0
  67. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_kafka.py +0 -0
  68. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_md5.py +0 -0
  69. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_selenium.py +0 -0
  70. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_simple_timer.py +0 -0
  71. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_str.py +0 -0
  72. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_type_mapping.py +0 -0
  73. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_url.py +0 -0
  74. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit.egg-info/SOURCES.txt +0 -0
  75. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit.egg-info/dependency_links.txt +0 -0
  76. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit.egg-info/not-zip-safe +0 -0
  77. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit.egg-info/requires.txt +0 -0
  78. {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit.egg-info/top_level.txt +0 -0
  79. {orbitkit-0.8.61 → orbitkit-0.8.62}/setup.cfg +0 -0
  80. {orbitkit-0.8.61 → orbitkit-0.8.62}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.61
3
+ Version: 0.8.62
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.62
@@ -429,9 +429,10 @@ class MinerUExtract:
429
429
  return self.level_two_txt_com(block_seq, pages_block, pages_body, 'sentence')
430
430
 
431
431
  def level_two_table(self, block_seq, pages_block, pages_body):
432
- """处理 table 类型 block,sentence 直接存储 HTML 格式"""
432
+ """处理 table 类型 block,sentence 直接存储 HTML 格式,脚注等文本单独生成 block"""
433
433
  table_blocks = pages_block.get('blocks', [])
434
434
  table_html = ""
435
+ text_contents = [] # 收集 table_footnote 等文本内容
435
436
  _image_detail_arr = []
436
437
  bbox = pages_block.get('bbox', [0, 0, 0, 0])
437
438
 
@@ -442,23 +443,42 @@ class MinerUExtract:
442
443
  span_type = _span.get('type', '')
443
444
  if span_type == 'table' and 'html' in _span:
444
445
  table_html = _span['html']
446
+ elif 'content' in _span: # table_footnote 等文本类型
447
+ text_contents.append(_span['content'])
445
448
  if _span.get('image_path'):
446
449
  _image_detail_arr.append({"path": f'images/{_span["image_path"]}', "desc": ""})
447
450
 
448
- _block_str = table_html
449
- if _block_str or _image_detail_arr:
451
+ # 表格 HTML 单独一个 block
452
+ if table_html or _image_detail_arr:
450
453
  block_seq = block_seq + 1
451
454
  text_location = self.get_location(bbox)
452
455
  self.blocks.append({
453
456
  "id": id_srv.get_random_short_id(),
454
457
  "page": self.page,
455
458
  "seq_no": block_seq,
456
- "sentence": _block_str,
459
+ "sentence": table_html,
457
460
  "type": 'table',
458
461
  "image_detail": _image_detail_arr,
459
462
  "text_location": text_location
460
463
  })
461
- pages_body.append(_block_str)
464
+ pages_body.append(table_html)
465
+
466
+ # 脚注等文本单独一个 block
467
+ if text_contents:
468
+ block_seq = block_seq + 1
469
+ text_location = self.get_location(bbox)
470
+ _text_str = '\n'.join(text_contents)
471
+ self.blocks.append({
472
+ "id": id_srv.get_random_short_id(),
473
+ "page": self.page,
474
+ "seq_no": block_seq,
475
+ "sentence": _text_str,
476
+ "type": 'sentence',
477
+ "image_detail": [],
478
+ "text_location": text_location
479
+ })
480
+ pages_body.append(_text_str)
481
+
462
482
  return block_seq, pages_body
463
483
 
464
484
  def level_two_image(self, block_seq, pages_block, pages_body):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.61
3
+ Version: 0.8.62
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.61
File without changes
File without changes
File without changes
File without changes
File without changes