orbitkit 0.8.75__tar.gz → 0.8.77__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
Files changed (81) hide show
  1. {orbitkit-0.8.75/orbitkit.egg-info → orbitkit-0.8.77}/PKG-INFO +1 -1
  2. orbitkit-0.8.77/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/airflow_handler/data_preprocessing.py +1 -1
  4. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +34 -3
  5. {orbitkit-0.8.75 → orbitkit-0.8.77/orbitkit.egg-info}/PKG-INFO +1 -1
  6. orbitkit-0.8.75/orbitkit/VERSION +0 -1
  7. {orbitkit-0.8.75 → orbitkit-0.8.77}/LICENSE +0 -0
  8. {orbitkit-0.8.75 → orbitkit-0.8.77}/MANIFEST.in +0 -0
  9. {orbitkit-0.8.75 → orbitkit-0.8.77}/README.md +0 -0
  10. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/__init__.py +0 -0
  11. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/airflow_handler/__init__.py +0 -0
  12. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  13. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  14. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  16. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/audio_transcoder/__init__.py +0 -0
  17. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  18. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/constant/__init__.py +0 -0
  19. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/constant/report_schema.py +0 -0
  20. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/id_srv/__init__.py +0 -0
  21. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/id_srv/id_gen.py +0 -0
  22. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/id_srv/id_perm_like.py +0 -0
  23. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/lark_send/__init__.py +0 -0
  24. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/lark_send/lark.py +0 -0
  25. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/llm_tools/__init__.py +0 -0
  26. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  27. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/orbit_type/__init__.py +0 -0
  28. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  29. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  30. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/orbit_type/tools.py +0 -0
  31. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_embedding/__init__.py +0 -0
  32. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  33. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  34. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/__init__.py +0 -0
  35. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  36. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/exceptions.py +0 -0
  37. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/mineru_demo.py +0 -0
  38. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  39. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  40. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  41. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  42. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  43. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  44. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  45. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  46. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  47. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor_simple/base.py +0 -0
  48. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  49. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor_simple/core.py +0 -0
  50. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  51. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  52. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  53. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_writer/__init__.py +0 -0
  54. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  55. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/__init__.py +0 -0
  56. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/cache_asset_downloader.py +0 -0
  57. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/common.py +0 -0
  58. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/customize_regix_manager.py +0 -0
  59. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/is_xbrl_structure.py +0 -0
  60. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/secret_manager.py +0 -0
  61. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/universal_extractor.py +0 -0
  62. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_aliyun.py +0 -0
  63. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  64. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_aws.py +0 -0
  65. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  66. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_date.py +0 -0
  67. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_html.py +0 -0
  68. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_kafka.py +0 -0
  69. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_md5.py +0 -0
  70. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_selenium.py +0 -0
  71. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_simple_timer.py +0 -0
  72. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_str.py +0 -0
  73. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_type_mapping.py +0 -0
  74. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit/util/util_url.py +0 -0
  75. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit.egg-info/SOURCES.txt +0 -0
  76. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit.egg-info/dependency_links.txt +0 -0
  77. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit.egg-info/not-zip-safe +0 -0
  78. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit.egg-info/requires.txt +0 -0
  79. {orbitkit-0.8.75 → orbitkit-0.8.77}/orbitkit.egg-info/top_level.txt +0 -0
  80. {orbitkit-0.8.75 → orbitkit-0.8.77}/setup.cfg +0 -0
  81. {orbitkit-0.8.75 → orbitkit-0.8.77}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.75
3
+ Version: 0.8.77
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.77
@@ -118,7 +118,7 @@ class DocumentProcessor:
118
118
  @staticmethod
119
119
  def update_target_stage_by_report_type(doc, target_stage):
120
120
  report_type_ids = doc.get('x_orbit_data', {}).get('report_type_id_list', [])
121
- allow_embedding_types = {'10002', '10085', '10076', '10122', '10311', '10178', '10075', '10090', '10050'}
121
+ allow_embedding_types = {'10122'}
122
122
  return target_stage if any(rid in allow_embedding_types for rid in report_type_ids) else "extract"
123
123
 
124
124
  @staticmethod
@@ -54,8 +54,8 @@ class MinerUApiBase:
54
54
 
55
55
  BASE_URL = "https://mineru.net/api/v4"
56
56
  MAX_FILE_SIZE_MB = 200 # 最大文件大小 MB
57
- MAX_PAGES = 600 # 最大页数
58
- SPLIT_PAGES = 500 # 拆分时每个分片的页数(留余量)
57
+ MAX_PAGES = 200 # 最大页数(MinerU 服务端硬限制)
58
+ SPLIT_PAGES = 150 # 拆分时每个分片的页数(留 10% 余量避免边界踩限)
59
59
  MAX_BATCH_FILES = 200 # 单次批量上传最大文件数
60
60
 
61
61
  def __init__(self, api_token: str, temp_dir: str = None):
@@ -543,6 +543,37 @@ class MinerUExtract:
543
543
  block_seq = block_seq - 1
544
544
  return block_seq, pages_body
545
545
 
546
+ def level_two_chart(self, block_seq, pages_block, pages_body):
547
+ pages_chart_blocks = pages_block["blocks"]
548
+ for _pages_chart_blocks in pages_chart_blocks:
549
+ block_seq = block_seq + 1
550
+ _pages_chart_blocks_type = _pages_chart_blocks['type']
551
+ pages_chart_blocks_line = _pages_chart_blocks["lines"]
552
+ bbox = pages_block['bbox']
553
+ if _pages_chart_blocks_type == 'chart_body':
554
+ block_type = 'chart'
555
+ elif _pages_chart_blocks_type in ['chart_caption', 'chart_footnote']:
556
+ block_type = 'sentence'
557
+ else:
558
+ raise Exception(f'Chart 异常目标值 {_pages_chart_blocks_type}:')
559
+ text_location = self.get_location(bbox)
560
+ _block_arr, _image_detail_arr = self.get_com_lines(pages_chart_blocks_line)
561
+ if len(_block_arr) > 0 or len(_image_detail_arr) > 0:
562
+ _block_arr_str = '\n'.join(_block_arr)
563
+ self.blocks.append({
564
+ "id": id_srv.get_random_short_id(),
565
+ "page": self.page,
566
+ "seq_no": block_seq,
567
+ "sentence": _block_arr_str,
568
+ "type": block_type,
569
+ "image_detail": _image_detail_arr,
570
+ "text_location": text_location
571
+ })
572
+ else:
573
+ if block_seq > 0:
574
+ block_seq = block_seq - 1
575
+ return block_seq, pages_body
576
+
546
577
  def level_two_interline_equation(self, block_seq, pages_block, pages_body):
547
578
  block_seq = block_seq + 1
548
579
  bbox = pages_block['bbox']
@@ -602,7 +633,7 @@ class MinerUExtract:
602
633
  _c_s.append(_l['content'])
603
634
  elif _l['type'] in ['table']:
604
635
  _c_s.append(_l['html'])
605
- elif _l['type'] in ['image']:
636
+ elif _l['type'] in ['image', 'chart']:
606
637
  pass
607
638
  else:
608
639
  raise Exception(f'类型匹配异常 意外值: {_l["type"]}')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.75
3
+ Version: 0.8.77
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.75
File without changes
File without changes
File without changes
File without changes
File without changes