orbitkit 0.8.61__tar.gz → 0.8.62__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.61/orbitkit.egg-info → orbitkit-0.8.62}/PKG-INFO +1 -1
- orbitkit-0.8.62/orbitkit/VERSION +1 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +25 -5
- {orbitkit-0.8.61 → orbitkit-0.8.62/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.61/orbitkit/VERSION +0 -1
- {orbitkit-0.8.61 → orbitkit-0.8.62}/LICENSE +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/MANIFEST.in +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/README.md +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/mineru_demo.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/universal_extractor.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/setup.cfg +0 -0
- {orbitkit-0.8.61 → orbitkit-0.8.62}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.62
|
|
@@ -429,9 +429,10 @@ class MinerUExtract:
|
|
|
429
429
|
return self.level_two_txt_com(block_seq, pages_block, pages_body, 'sentence')
|
|
430
430
|
|
|
431
431
|
def level_two_table(self, block_seq, pages_block, pages_body):
|
|
432
|
-
"""处理 table 类型 block,sentence 直接存储 HTML
|
|
432
|
+
"""处理 table 类型 block,sentence 直接存储 HTML 格式,脚注等文本单独生成 block"""
|
|
433
433
|
table_blocks = pages_block.get('blocks', [])
|
|
434
434
|
table_html = ""
|
|
435
|
+
text_contents = [] # 收集 table_footnote 等文本内容
|
|
435
436
|
_image_detail_arr = []
|
|
436
437
|
bbox = pages_block.get('bbox', [0, 0, 0, 0])
|
|
437
438
|
|
|
@@ -442,23 +443,42 @@ class MinerUExtract:
|
|
|
442
443
|
span_type = _span.get('type', '')
|
|
443
444
|
if span_type == 'table' and 'html' in _span:
|
|
444
445
|
table_html = _span['html']
|
|
446
|
+
elif 'content' in _span: # table_footnote 等文本类型
|
|
447
|
+
text_contents.append(_span['content'])
|
|
445
448
|
if _span.get('image_path'):
|
|
446
449
|
_image_detail_arr.append({"path": f'images/{_span["image_path"]}', "desc": ""})
|
|
447
450
|
|
|
448
|
-
|
|
449
|
-
if
|
|
451
|
+
# 表格 HTML 单独一个 block
|
|
452
|
+
if table_html or _image_detail_arr:
|
|
450
453
|
block_seq = block_seq + 1
|
|
451
454
|
text_location = self.get_location(bbox)
|
|
452
455
|
self.blocks.append({
|
|
453
456
|
"id": id_srv.get_random_short_id(),
|
|
454
457
|
"page": self.page,
|
|
455
458
|
"seq_no": block_seq,
|
|
456
|
-
"sentence":
|
|
459
|
+
"sentence": table_html,
|
|
457
460
|
"type": 'table',
|
|
458
461
|
"image_detail": _image_detail_arr,
|
|
459
462
|
"text_location": text_location
|
|
460
463
|
})
|
|
461
|
-
pages_body.append(
|
|
464
|
+
pages_body.append(table_html)
|
|
465
|
+
|
|
466
|
+
# 脚注等文本单独一个 block
|
|
467
|
+
if text_contents:
|
|
468
|
+
block_seq = block_seq + 1
|
|
469
|
+
text_location = self.get_location(bbox)
|
|
470
|
+
_text_str = '\n'.join(text_contents)
|
|
471
|
+
self.blocks.append({
|
|
472
|
+
"id": id_srv.get_random_short_id(),
|
|
473
|
+
"page": self.page,
|
|
474
|
+
"seq_no": block_seq,
|
|
475
|
+
"sentence": _text_str,
|
|
476
|
+
"type": 'sentence',
|
|
477
|
+
"image_detail": [],
|
|
478
|
+
"text_location": text_location
|
|
479
|
+
})
|
|
480
|
+
pages_body.append(_text_str)
|
|
481
|
+
|
|
462
482
|
return block_seq, pages_body
|
|
463
483
|
|
|
464
484
|
def level_two_image(self, block_seq, pages_block, pages_body):
|
orbitkit-0.8.61/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.61
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|