orbitkit 0.8.62__tar.gz → 0.8.63__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.62/orbitkit.egg-info → orbitkit-0.8.63}/PKG-INFO +1 -1
- orbitkit-0.8.63/orbitkit/VERSION +1 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +18 -14
- {orbitkit-0.8.62 → orbitkit-0.8.63/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.62/orbitkit/VERSION +0 -1
- {orbitkit-0.8.62 → orbitkit-0.8.63}/LICENSE +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/MANIFEST.in +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/README.md +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/mineru_demo.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/universal_extractor.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/setup.cfg +0 -0
- {orbitkit-0.8.62 → orbitkit-0.8.63}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.63
|
|
@@ -429,29 +429,34 @@ class MinerUExtract:
|
|
|
429
429
|
return self.level_two_txt_com(block_seq, pages_block, pages_body, 'sentence')
|
|
430
430
|
|
|
431
431
|
def level_two_table(self, block_seq, pages_block, pages_body):
|
|
432
|
-
"""处理 table 类型 block,sentence 直接存储 HTML 格式,脚注等文本单独生成 block"""
|
|
432
|
+
"""处理 table 类型 block,sentence 直接存储 HTML 格式,脚注等文本单独生成 block(各自保留 bbox)"""
|
|
433
433
|
table_blocks = pages_block.get('blocks', [])
|
|
434
434
|
table_html = ""
|
|
435
|
-
|
|
435
|
+
table_bbox = pages_block.get('bbox', [0, 0, 0, 0])
|
|
436
436
|
_image_detail_arr = []
|
|
437
|
-
|
|
437
|
+
# 收集非 table 类型的子 block(如 table_footnote, table_caption 等),保留各自的 bbox
|
|
438
|
+
text_blocks = []
|
|
438
439
|
|
|
439
|
-
for
|
|
440
|
-
|
|
440
|
+
for sub_block in table_blocks:
|
|
441
|
+
sub_bbox = sub_block.get('bbox', table_bbox)
|
|
442
|
+
pages_lines = sub_block.get("lines", [])
|
|
441
443
|
for _line in pages_lines:
|
|
442
444
|
for _span in _line.get('spans', []):
|
|
443
445
|
span_type = _span.get('type', '')
|
|
444
446
|
if span_type == 'table' and 'html' in _span:
|
|
445
447
|
table_html = _span['html']
|
|
446
|
-
elif 'content' in _span: # table_footnote 等文本类型
|
|
447
|
-
|
|
448
|
+
elif 'content' in _span: # table_footnote, table_caption 等文本类型
|
|
449
|
+
text_blocks.append({
|
|
450
|
+
'content': _span['content'],
|
|
451
|
+
'bbox': sub_bbox
|
|
452
|
+
})
|
|
448
453
|
if _span.get('image_path'):
|
|
449
454
|
_image_detail_arr.append({"path": f'images/{_span["image_path"]}', "desc": ""})
|
|
450
455
|
|
|
451
456
|
# 表格 HTML 单独一个 block
|
|
452
457
|
if table_html or _image_detail_arr:
|
|
453
458
|
block_seq = block_seq + 1
|
|
454
|
-
text_location = self.get_location(
|
|
459
|
+
text_location = self.get_location(table_bbox)
|
|
455
460
|
self.blocks.append({
|
|
456
461
|
"id": id_srv.get_random_short_id(),
|
|
457
462
|
"page": self.page,
|
|
@@ -463,21 +468,20 @@ class MinerUExtract:
|
|
|
463
468
|
})
|
|
464
469
|
pages_body.append(table_html)
|
|
465
470
|
|
|
466
|
-
#
|
|
467
|
-
|
|
471
|
+
# 每个脚注/标题等文本单独一个 block,使用各自的 bbox
|
|
472
|
+
for text_block in text_blocks:
|
|
468
473
|
block_seq = block_seq + 1
|
|
469
|
-
text_location = self.get_location(bbox)
|
|
470
|
-
_text_str = '\n'.join(text_contents)
|
|
474
|
+
text_location = self.get_location(text_block['bbox'])
|
|
471
475
|
self.blocks.append({
|
|
472
476
|
"id": id_srv.get_random_short_id(),
|
|
473
477
|
"page": self.page,
|
|
474
478
|
"seq_no": block_seq,
|
|
475
|
-
"sentence":
|
|
479
|
+
"sentence": text_block['content'],
|
|
476
480
|
"type": 'sentence',
|
|
477
481
|
"image_detail": [],
|
|
478
482
|
"text_location": text_location
|
|
479
483
|
})
|
|
480
|
-
pages_body.append(
|
|
484
|
+
pages_body.append(text_block['content'])
|
|
481
485
|
|
|
482
486
|
return block_seq, pages_body
|
|
483
487
|
|
orbitkit-0.8.62/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.62
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|