orbitkit 0.8.61__tar.gz → 0.8.63__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.61/orbitkit.egg-info → orbitkit-0.8.63}/PKG-INFO +1 -1
- orbitkit-0.8.63/orbitkit/VERSION +1 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +33 -9
- {orbitkit-0.8.61 → orbitkit-0.8.63/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.61/orbitkit/VERSION +0 -1
- {orbitkit-0.8.61 → orbitkit-0.8.63}/LICENSE +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/MANIFEST.in +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/README.md +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/mineru_demo.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/universal_extractor.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/setup.cfg +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.63}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.63
|
|
@@ -429,36 +429,60 @@ class MinerUExtract:
|
|
|
429
429
|
return self.level_two_txt_com(block_seq, pages_block, pages_body, 'sentence')
|
|
430
430
|
|
|
431
431
|
def level_two_table(self, block_seq, pages_block, pages_body):
|
|
432
|
-
"""处理 table 类型 block,sentence 直接存储 HTML
|
|
432
|
+
"""处理 table 类型 block,sentence 直接存储 HTML 格式,脚注等文本单独生成 block(各自保留 bbox)"""
|
|
433
433
|
table_blocks = pages_block.get('blocks', [])
|
|
434
434
|
table_html = ""
|
|
435
|
+
table_bbox = pages_block.get('bbox', [0, 0, 0, 0])
|
|
435
436
|
_image_detail_arr = []
|
|
436
|
-
|
|
437
|
+
# 收集非 table 类型的子 block(如 table_footnote, table_caption 等),保留各自的 bbox
|
|
438
|
+
text_blocks = []
|
|
437
439
|
|
|
438
|
-
for
|
|
439
|
-
|
|
440
|
+
for sub_block in table_blocks:
|
|
441
|
+
sub_bbox = sub_block.get('bbox', table_bbox)
|
|
442
|
+
pages_lines = sub_block.get("lines", [])
|
|
440
443
|
for _line in pages_lines:
|
|
441
444
|
for _span in _line.get('spans', []):
|
|
442
445
|
span_type = _span.get('type', '')
|
|
443
446
|
if span_type == 'table' and 'html' in _span:
|
|
444
447
|
table_html = _span['html']
|
|
448
|
+
elif 'content' in _span: # table_footnote, table_caption 等文本类型
|
|
449
|
+
text_blocks.append({
|
|
450
|
+
'content': _span['content'],
|
|
451
|
+
'bbox': sub_bbox
|
|
452
|
+
})
|
|
445
453
|
if _span.get('image_path'):
|
|
446
454
|
_image_detail_arr.append({"path": f'images/{_span["image_path"]}', "desc": ""})
|
|
447
455
|
|
|
448
|
-
|
|
449
|
-
if
|
|
456
|
+
# 表格 HTML 单独一个 block
|
|
457
|
+
if table_html or _image_detail_arr:
|
|
450
458
|
block_seq = block_seq + 1
|
|
451
|
-
text_location = self.get_location(
|
|
459
|
+
text_location = self.get_location(table_bbox)
|
|
452
460
|
self.blocks.append({
|
|
453
461
|
"id": id_srv.get_random_short_id(),
|
|
454
462
|
"page": self.page,
|
|
455
463
|
"seq_no": block_seq,
|
|
456
|
-
"sentence":
|
|
464
|
+
"sentence": table_html,
|
|
457
465
|
"type": 'table',
|
|
458
466
|
"image_detail": _image_detail_arr,
|
|
459
467
|
"text_location": text_location
|
|
460
468
|
})
|
|
461
|
-
pages_body.append(
|
|
469
|
+
pages_body.append(table_html)
|
|
470
|
+
|
|
471
|
+
# 每个脚注/标题等文本单独一个 block,使用各自的 bbox
|
|
472
|
+
for text_block in text_blocks:
|
|
473
|
+
block_seq = block_seq + 1
|
|
474
|
+
text_location = self.get_location(text_block['bbox'])
|
|
475
|
+
self.blocks.append({
|
|
476
|
+
"id": id_srv.get_random_short_id(),
|
|
477
|
+
"page": self.page,
|
|
478
|
+
"seq_no": block_seq,
|
|
479
|
+
"sentence": text_block['content'],
|
|
480
|
+
"type": 'sentence',
|
|
481
|
+
"image_detail": [],
|
|
482
|
+
"text_location": text_location
|
|
483
|
+
})
|
|
484
|
+
pages_body.append(text_block['content'])
|
|
485
|
+
|
|
462
486
|
return block_seq, pages_body
|
|
463
487
|
|
|
464
488
|
def level_two_image(self, block_seq, pages_block, pages_body):
|
orbitkit-0.8.61/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.61
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|