orbitkit 0.8.60__tar.gz → 0.8.62__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. {orbitkit-0.8.60/orbitkit.egg-info → orbitkit-0.8.62}/PKG-INFO +1 -1
  2. orbitkit-0.8.62/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/audio_transcoder/netmind_extract_v1.py +13 -6
  4. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +27 -10
  5. {orbitkit-0.8.60 → orbitkit-0.8.62/orbitkit.egg-info}/PKG-INFO +1 -1
  6. orbitkit-0.8.60/orbitkit/VERSION +0 -1
  7. {orbitkit-0.8.60 → orbitkit-0.8.62}/LICENSE +0 -0
  8. {orbitkit-0.8.60 → orbitkit-0.8.62}/MANIFEST.in +0 -0
  9. {orbitkit-0.8.60 → orbitkit-0.8.62}/README.md +0 -0
  10. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/__init__.py +0 -0
  11. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/__init__.py +0 -0
  12. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  13. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  14. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  15. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_handler.py +0 -0
  16. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  17. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/audio_transcoder/__init__.py +0 -0
  18. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/constant/__init__.py +0 -0
  19. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/constant/report_schema.py +0 -0
  20. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/id_srv/__init__.py +0 -0
  21. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/id_srv/id_gen.py +0 -0
  22. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/id_srv/id_perm_like.py +0 -0
  23. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/lark_send/__init__.py +0 -0
  24. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/lark_send/lark.py +0 -0
  25. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/llm_tools/__init__.py +0 -0
  26. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  27. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/orbit_type/__init__.py +0 -0
  28. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  29. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  30. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/orbit_type/tools.py +0 -0
  31. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_embedding/__init__.py +0 -0
  32. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  33. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  34. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/__init__.py +0 -0
  35. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  36. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/exceptions.py +0 -0
  37. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/mineru_demo.py +0 -0
  38. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  39. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  40. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  41. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  42. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  43. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  44. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  45. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  46. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  47. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/base.py +0 -0
  48. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  49. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/core.py +0 -0
  50. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  51. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  52. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  53. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_writer/__init__.py +0 -0
  54. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  55. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/__init__.py +0 -0
  56. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/cache_asset_downloader.py +0 -0
  57. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/common.py +0 -0
  58. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/customize_regix_manager.py +0 -0
  59. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/secret_manager.py +0 -0
  60. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/universal_extractor.py +0 -0
  61. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_aliyun.py +0 -0
  62. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  63. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_aws.py +0 -0
  64. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  65. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_date.py +0 -0
  66. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_html.py +0 -0
  67. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_kafka.py +0 -0
  68. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_md5.py +0 -0
  69. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_selenium.py +0 -0
  70. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_simple_timer.py +0 -0
  71. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_str.py +0 -0
  72. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_type_mapping.py +0 -0
  73. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_url.py +0 -0
  74. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit.egg-info/SOURCES.txt +0 -0
  75. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit.egg-info/dependency_links.txt +0 -0
  76. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit.egg-info/not-zip-safe +0 -0
  77. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit.egg-info/requires.txt +0 -0
  78. {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit.egg-info/top_level.txt +0 -0
  79. {orbitkit-0.8.60 → orbitkit-0.8.62}/setup.cfg +0 -0
  80. {orbitkit-0.8.60 → orbitkit-0.8.62}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.60
3
+ Version: 0.8.62
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.62
@@ -104,7 +104,7 @@ def text_processing(netmind_data, lang, translate_model='gpt-4.1-mini'):
104
104
  def send_request_to_api(s3_remote_url, **kwargs):
105
105
  endpoint = kwargs['endpoint']
106
106
  token = kwargs['token']
107
- lang = kwargs.get('lang', 'en')
107
+ lang = kwargs.get('lang', None)
108
108
  headers = {
109
109
  'Authorization': token,
110
110
  }
@@ -112,8 +112,9 @@ def send_request_to_api(s3_remote_url, **kwargs):
112
112
  files = {
113
113
  'model': (None, 'WhisperX'),
114
114
  'url': (None, s3_remote_url),
115
- 'language': (None, lang),
116
115
  }
116
+ if lang:
117
+ files['language'] = (None, lang)
117
118
 
118
119
  response = requests.post(endpoint, headers=headers, files=files)
119
120
  response.raise_for_status()
@@ -125,7 +126,7 @@ def send_request_to_api(s3_remote_url, **kwargs):
125
126
  def send_request_to_stream(file_steam, **kwargs):
126
127
  endpoint = kwargs['endpoint']
127
128
  token = kwargs['token']
128
- lang = kwargs.get('lang', 'en')
129
+ lang = kwargs.get('lang', None)
129
130
  headers = {
130
131
  'Authorization': token,
131
132
  }
@@ -133,8 +134,9 @@ def send_request_to_stream(file_steam, **kwargs):
133
134
  files = {
134
135
  'model': (None, 'WhisperX'),
135
136
  'files': (None, file_steam),
136
- 'language': (None, lang),
137
137
  }
138
+ if lang:
139
+ files['language'] = (None, lang)
138
140
 
139
141
  response = requests.post(endpoint, headers=headers, files=files)
140
142
  response.raise_for_status()
@@ -143,7 +145,7 @@ def send_request_to_stream(file_steam, **kwargs):
143
145
 
144
146
 
145
147
  def request_wav_from_netmind(s3_client, s3_path=None, file_steam=None, **kwargs):
146
- lang = kwargs.get('lang', 'en')
148
+ lang = kwargs.get('lang', None)
147
149
  folder = kwargs.get('folder', '')
148
150
  translate_model = kwargs.get('translate_model', 'gpt-4.1-mini')
149
151
  if s3_path:
@@ -166,7 +168,12 @@ def request_wav_from_netmind(s3_client, s3_path=None, file_steam=None, **kwargs)
166
168
  with open(json_netmind_wav_path, 'w', encoding='utf-8') as json_file:
167
169
  json.dump(data, json_file, ensure_ascii=False, indent=4)
168
170
 
169
- net_process = text_processing(data, lang, translate_model=translate_model)
171
+ # lang None 时跳过翻译,直接使用原始结果
172
+ if lang:
173
+ net_process = text_processing(data, lang, translate_model=translate_model)
174
+ else:
175
+ logger.info("lang=None, 跳过翻译,保持原语言")
176
+ net_process = data
170
177
 
171
178
  # 翻译接口处理
172
179
  json_netmind_lang_wav_path = os.path.join(folder, 'netmind_lang_wav.json')
@@ -338,12 +338,10 @@ class MinerUExtract:
338
338
  return self.default_extractable(pdf_pages, 'discarded_blocks', pages_body, block_seq)
339
339
 
340
340
  def default_blocks(self, pdf_pages):
341
- print(pdf_pages.keys())
342
- raise Exception('提取block方法异常')
341
+ raise Exception(f'提取block方法异常,未知参数: {list(pdf_pages.keys())}')
343
342
 
344
343
  def default_extractable(self, pdf_pages, block_key, pages_body, block_seq):
345
344
  for pages_block in pdf_pages.get(block_key, []):
346
- print(pages_block['type'])
347
345
  handler_type_func = getattr(self, f'level_two_{pages_block["type"]}', self.level_two_default_blocks)
348
346
  block_seq, pages_body = handler_type_func(block_seq, pages_block, pages_body)
349
347
  return pages_body
@@ -431,9 +429,10 @@ class MinerUExtract:
431
429
  return self.level_two_txt_com(block_seq, pages_block, pages_body, 'sentence')
432
430
 
433
431
  def level_two_table(self, block_seq, pages_block, pages_body):
434
- """处理 table 类型 block,sentence 直接存储 HTML 格式"""
432
+ """处理 table 类型 block,sentence 直接存储 HTML 格式,脚注等文本单独生成 block"""
435
433
  table_blocks = pages_block.get('blocks', [])
436
434
  table_html = ""
435
+ text_contents = [] # 收集 table_footnote 等文本内容
437
436
  _image_detail_arr = []
438
437
  bbox = pages_block.get('bbox', [0, 0, 0, 0])
439
438
 
@@ -444,23 +443,42 @@ class MinerUExtract:
444
443
  span_type = _span.get('type', '')
445
444
  if span_type == 'table' and 'html' in _span:
446
445
  table_html = _span['html']
446
+ elif 'content' in _span: # table_footnote 等文本类型
447
+ text_contents.append(_span['content'])
447
448
  if _span.get('image_path'):
448
449
  _image_detail_arr.append({"path": f'images/{_span["image_path"]}', "desc": ""})
449
450
 
450
- _block_str = table_html
451
- if _block_str or _image_detail_arr:
451
+ # 表格 HTML 单独一个 block
452
+ if table_html or _image_detail_arr:
452
453
  block_seq = block_seq + 1
453
454
  text_location = self.get_location(bbox)
454
455
  self.blocks.append({
455
456
  "id": id_srv.get_random_short_id(),
456
457
  "page": self.page,
457
458
  "seq_no": block_seq,
458
- "sentence": _block_str,
459
+ "sentence": table_html,
459
460
  "type": 'table',
460
461
  "image_detail": _image_detail_arr,
461
462
  "text_location": text_location
462
463
  })
463
- pages_body.append(_block_str)
464
+ pages_body.append(table_html)
465
+
466
+ # 脚注等文本单独一个 block
467
+ if text_contents:
468
+ block_seq = block_seq + 1
469
+ text_location = self.get_location(bbox)
470
+ _text_str = '\n'.join(text_contents)
471
+ self.blocks.append({
472
+ "id": id_srv.get_random_short_id(),
473
+ "page": self.page,
474
+ "seq_no": block_seq,
475
+ "sentence": _text_str,
476
+ "type": 'sentence',
477
+ "image_detail": [],
478
+ "text_location": text_location
479
+ })
480
+ pages_body.append(_text_str)
481
+
464
482
  return block_seq, pages_body
465
483
 
466
484
  def level_two_image(self, block_seq, pages_block, pages_body):
@@ -541,8 +559,7 @@ class MinerUExtract:
541
559
  return block_seq, pages_body
542
560
 
543
561
  def level_two_default_blocks(self, block_seq, pages_block, pages_body):
544
- print(pages_block.keys())
545
- raise Exception('提取block方法异常')
562
+ raise Exception(f'提取block方法异常,未知类型: {pages_block.get("type")},参数: {list(pages_block.keys())}')
546
563
 
547
564
  def get_com_lines(self, pages_lines):
548
565
  _block_arr = []
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.60
3
+ Version: 0.8.62
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.60
File without changes
File without changes
File without changes
File without changes
File without changes