orbitkit 0.8.60__tar.gz → 0.8.62__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.60/orbitkit.egg-info → orbitkit-0.8.62}/PKG-INFO +1 -1
- orbitkit-0.8.62/orbitkit/VERSION +1 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/audio_transcoder/netmind_extract_v1.py +13 -6
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +27 -10
- {orbitkit-0.8.60 → orbitkit-0.8.62/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.60/orbitkit/VERSION +0 -1
- {orbitkit-0.8.60 → orbitkit-0.8.62}/LICENSE +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/MANIFEST.in +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/README.md +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/mineru_demo.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/universal_extractor.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/setup.cfg +0 -0
- {orbitkit-0.8.60 → orbitkit-0.8.62}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.62
|
|
@@ -104,7 +104,7 @@ def text_processing(netmind_data, lang, translate_model='gpt-4.1-mini'):
|
|
|
104
104
|
def send_request_to_api(s3_remote_url, **kwargs):
|
|
105
105
|
endpoint = kwargs['endpoint']
|
|
106
106
|
token = kwargs['token']
|
|
107
|
-
lang = kwargs.get('lang',
|
|
107
|
+
lang = kwargs.get('lang', None)
|
|
108
108
|
headers = {
|
|
109
109
|
'Authorization': token,
|
|
110
110
|
}
|
|
@@ -112,8 +112,9 @@ def send_request_to_api(s3_remote_url, **kwargs):
|
|
|
112
112
|
files = {
|
|
113
113
|
'model': (None, 'WhisperX'),
|
|
114
114
|
'url': (None, s3_remote_url),
|
|
115
|
-
'language': (None, lang),
|
|
116
115
|
}
|
|
116
|
+
if lang:
|
|
117
|
+
files['language'] = (None, lang)
|
|
117
118
|
|
|
118
119
|
response = requests.post(endpoint, headers=headers, files=files)
|
|
119
120
|
response.raise_for_status()
|
|
@@ -125,7 +126,7 @@ def send_request_to_api(s3_remote_url, **kwargs):
|
|
|
125
126
|
def send_request_to_stream(file_steam, **kwargs):
|
|
126
127
|
endpoint = kwargs['endpoint']
|
|
127
128
|
token = kwargs['token']
|
|
128
|
-
lang = kwargs.get('lang',
|
|
129
|
+
lang = kwargs.get('lang', None)
|
|
129
130
|
headers = {
|
|
130
131
|
'Authorization': token,
|
|
131
132
|
}
|
|
@@ -133,8 +134,9 @@ def send_request_to_stream(file_steam, **kwargs):
|
|
|
133
134
|
files = {
|
|
134
135
|
'model': (None, 'WhisperX'),
|
|
135
136
|
'files': (None, file_steam),
|
|
136
|
-
'language': (None, lang),
|
|
137
137
|
}
|
|
138
|
+
if lang:
|
|
139
|
+
files['language'] = (None, lang)
|
|
138
140
|
|
|
139
141
|
response = requests.post(endpoint, headers=headers, files=files)
|
|
140
142
|
response.raise_for_status()
|
|
@@ -143,7 +145,7 @@ def send_request_to_stream(file_steam, **kwargs):
|
|
|
143
145
|
|
|
144
146
|
|
|
145
147
|
def request_wav_from_netmind(s3_client, s3_path=None, file_steam=None, **kwargs):
|
|
146
|
-
lang = kwargs.get('lang',
|
|
148
|
+
lang = kwargs.get('lang', None)
|
|
147
149
|
folder = kwargs.get('folder', '')
|
|
148
150
|
translate_model = kwargs.get('translate_model', 'gpt-4.1-mini')
|
|
149
151
|
if s3_path:
|
|
@@ -166,7 +168,12 @@ def request_wav_from_netmind(s3_client, s3_path=None, file_steam=None, **kwargs)
|
|
|
166
168
|
with open(json_netmind_wav_path, 'w', encoding='utf-8') as json_file:
|
|
167
169
|
json.dump(data, json_file, ensure_ascii=False, indent=4)
|
|
168
170
|
|
|
169
|
-
|
|
171
|
+
# lang 为 None 时跳过翻译,直接使用原始结果
|
|
172
|
+
if lang:
|
|
173
|
+
net_process = text_processing(data, lang, translate_model=translate_model)
|
|
174
|
+
else:
|
|
175
|
+
logger.info("lang=None, 跳过翻译,保持原语言")
|
|
176
|
+
net_process = data
|
|
170
177
|
|
|
171
178
|
# 翻译接口处理
|
|
172
179
|
json_netmind_lang_wav_path = os.path.join(folder, 'netmind_lang_wav.json')
|
|
@@ -338,12 +338,10 @@ class MinerUExtract:
|
|
|
338
338
|
return self.default_extractable(pdf_pages, 'discarded_blocks', pages_body, block_seq)
|
|
339
339
|
|
|
340
340
|
def default_blocks(self, pdf_pages):
|
|
341
|
-
|
|
342
|
-
raise Exception('提取block方法异常')
|
|
341
|
+
raise Exception(f'提取block方法异常,未知参数: {list(pdf_pages.keys())}')
|
|
343
342
|
|
|
344
343
|
def default_extractable(self, pdf_pages, block_key, pages_body, block_seq):
|
|
345
344
|
for pages_block in pdf_pages.get(block_key, []):
|
|
346
|
-
print(pages_block['type'])
|
|
347
345
|
handler_type_func = getattr(self, f'level_two_{pages_block["type"]}', self.level_two_default_blocks)
|
|
348
346
|
block_seq, pages_body = handler_type_func(block_seq, pages_block, pages_body)
|
|
349
347
|
return pages_body
|
|
@@ -431,9 +429,10 @@ class MinerUExtract:
|
|
|
431
429
|
return self.level_two_txt_com(block_seq, pages_block, pages_body, 'sentence')
|
|
432
430
|
|
|
433
431
|
def level_two_table(self, block_seq, pages_block, pages_body):
|
|
434
|
-
"""处理 table 类型 block,sentence 直接存储 HTML
|
|
432
|
+
"""处理 table 类型 block,sentence 直接存储 HTML 格式,脚注等文本单独生成 block"""
|
|
435
433
|
table_blocks = pages_block.get('blocks', [])
|
|
436
434
|
table_html = ""
|
|
435
|
+
text_contents = [] # 收集 table_footnote 等文本内容
|
|
437
436
|
_image_detail_arr = []
|
|
438
437
|
bbox = pages_block.get('bbox', [0, 0, 0, 0])
|
|
439
438
|
|
|
@@ -444,23 +443,42 @@ class MinerUExtract:
|
|
|
444
443
|
span_type = _span.get('type', '')
|
|
445
444
|
if span_type == 'table' and 'html' in _span:
|
|
446
445
|
table_html = _span['html']
|
|
446
|
+
elif 'content' in _span: # table_footnote 等文本类型
|
|
447
|
+
text_contents.append(_span['content'])
|
|
447
448
|
if _span.get('image_path'):
|
|
448
449
|
_image_detail_arr.append({"path": f'images/{_span["image_path"]}', "desc": ""})
|
|
449
450
|
|
|
450
|
-
|
|
451
|
-
if
|
|
451
|
+
# 表格 HTML 单独一个 block
|
|
452
|
+
if table_html or _image_detail_arr:
|
|
452
453
|
block_seq = block_seq + 1
|
|
453
454
|
text_location = self.get_location(bbox)
|
|
454
455
|
self.blocks.append({
|
|
455
456
|
"id": id_srv.get_random_short_id(),
|
|
456
457
|
"page": self.page,
|
|
457
458
|
"seq_no": block_seq,
|
|
458
|
-
"sentence":
|
|
459
|
+
"sentence": table_html,
|
|
459
460
|
"type": 'table',
|
|
460
461
|
"image_detail": _image_detail_arr,
|
|
461
462
|
"text_location": text_location
|
|
462
463
|
})
|
|
463
|
-
pages_body.append(
|
|
464
|
+
pages_body.append(table_html)
|
|
465
|
+
|
|
466
|
+
# 脚注等文本单独一个 block
|
|
467
|
+
if text_contents:
|
|
468
|
+
block_seq = block_seq + 1
|
|
469
|
+
text_location = self.get_location(bbox)
|
|
470
|
+
_text_str = '\n'.join(text_contents)
|
|
471
|
+
self.blocks.append({
|
|
472
|
+
"id": id_srv.get_random_short_id(),
|
|
473
|
+
"page": self.page,
|
|
474
|
+
"seq_no": block_seq,
|
|
475
|
+
"sentence": _text_str,
|
|
476
|
+
"type": 'sentence',
|
|
477
|
+
"image_detail": [],
|
|
478
|
+
"text_location": text_location
|
|
479
|
+
})
|
|
480
|
+
pages_body.append(_text_str)
|
|
481
|
+
|
|
464
482
|
return block_seq, pages_body
|
|
465
483
|
|
|
466
484
|
def level_two_image(self, block_seq, pages_block, pages_body):
|
|
@@ -541,8 +559,7 @@ class MinerUExtract:
|
|
|
541
559
|
return block_seq, pages_body
|
|
542
560
|
|
|
543
561
|
def level_two_default_blocks(self, block_seq, pages_block, pages_body):
|
|
544
|
-
|
|
545
|
-
raise Exception('提取block方法异常')
|
|
562
|
+
raise Exception(f'提取block方法异常,未知类型: {pages_block.get("type")},参数: {list(pages_block.keys())}')
|
|
546
563
|
|
|
547
564
|
def get_com_lines(self, pages_lines):
|
|
548
565
|
_block_arr = []
|
orbitkit-0.8.60/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.60
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|