orbitkit 0.8.75__tar.gz → 0.8.76__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.75/orbitkit.egg-info → orbitkit-0.8.76}/PKG-INFO +1 -1
- orbitkit-0.8.76/orbitkit/VERSION +1 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +34 -3
- {orbitkit-0.8.75 → orbitkit-0.8.76/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.75/orbitkit/VERSION +0 -1
- {orbitkit-0.8.75 → orbitkit-0.8.76}/LICENSE +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/MANIFEST.in +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/README.md +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/mineru_demo.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/is_xbrl_structure.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/universal_extractor.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/setup.cfg +0 -0
- {orbitkit-0.8.75 → orbitkit-0.8.76}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.76
|
|
@@ -54,8 +54,8 @@ class MinerUApiBase:
|
|
|
54
54
|
|
|
55
55
|
BASE_URL = "https://mineru.net/api/v4"
|
|
56
56
|
MAX_FILE_SIZE_MB = 200 # 最大文件大小 MB
|
|
57
|
-
MAX_PAGES =
|
|
58
|
-
SPLIT_PAGES =
|
|
57
|
+
MAX_PAGES = 200 # 最大页数(MinerU 服务端硬限制)
|
|
58
|
+
SPLIT_PAGES = 150 # 拆分时每个分片的页数(留 10% 余量避免边界踩限)
|
|
59
59
|
MAX_BATCH_FILES = 200 # 单次批量上传最大文件数
|
|
60
60
|
|
|
61
61
|
def __init__(self, api_token: str, temp_dir: str = None):
|
|
@@ -543,6 +543,37 @@ class MinerUExtract:
|
|
|
543
543
|
block_seq = block_seq - 1
|
|
544
544
|
return block_seq, pages_body
|
|
545
545
|
|
|
546
|
+
def level_two_chart(self, block_seq, pages_block, pages_body):
|
|
547
|
+
pages_chart_blocks = pages_block["blocks"]
|
|
548
|
+
for _pages_chart_blocks in pages_chart_blocks:
|
|
549
|
+
block_seq = block_seq + 1
|
|
550
|
+
_pages_chart_blocks_type = _pages_chart_blocks['type']
|
|
551
|
+
pages_chart_blocks_line = _pages_chart_blocks["lines"]
|
|
552
|
+
bbox = pages_block['bbox']
|
|
553
|
+
if _pages_chart_blocks_type == 'chart_body':
|
|
554
|
+
block_type = 'chart'
|
|
555
|
+
elif _pages_chart_blocks_type in ['chart_caption', 'chart_footnote']:
|
|
556
|
+
block_type = 'sentence'
|
|
557
|
+
else:
|
|
558
|
+
raise Exception(f'Chart 异常目标值 {_pages_chart_blocks_type}:')
|
|
559
|
+
text_location = self.get_location(bbox)
|
|
560
|
+
_block_arr, _image_detail_arr = self.get_com_lines(pages_chart_blocks_line)
|
|
561
|
+
if len(_block_arr) > 0 or len(_image_detail_arr) > 0:
|
|
562
|
+
_block_arr_str = '\n'.join(_block_arr)
|
|
563
|
+
self.blocks.append({
|
|
564
|
+
"id": id_srv.get_random_short_id(),
|
|
565
|
+
"page": self.page,
|
|
566
|
+
"seq_no": block_seq,
|
|
567
|
+
"sentence": _block_arr_str,
|
|
568
|
+
"type": block_type,
|
|
569
|
+
"image_detail": _image_detail_arr,
|
|
570
|
+
"text_location": text_location
|
|
571
|
+
})
|
|
572
|
+
else:
|
|
573
|
+
if block_seq > 0:
|
|
574
|
+
block_seq = block_seq - 1
|
|
575
|
+
return block_seq, pages_body
|
|
576
|
+
|
|
546
577
|
def level_two_interline_equation(self, block_seq, pages_block, pages_body):
|
|
547
578
|
block_seq = block_seq + 1
|
|
548
579
|
bbox = pages_block['bbox']
|
|
@@ -602,7 +633,7 @@ class MinerUExtract:
|
|
|
602
633
|
_c_s.append(_l['content'])
|
|
603
634
|
elif _l['type'] in ['table']:
|
|
604
635
|
_c_s.append(_l['html'])
|
|
605
|
-
elif _l['type'] in ['image']:
|
|
636
|
+
elif _l['type'] in ['image', 'chart']:
|
|
606
637
|
pass
|
|
607
638
|
else:
|
|
608
639
|
raise Exception(f'类型匹配异常 意外值: {_l["type"]}')
|
orbitkit-0.8.75/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.75
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|