orbitkit 0.8.35__tar.gz → 0.8.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.35/orbitkit.egg-info → orbitkit-0.8.36}/PKG-INFO +1 -1
- orbitkit-0.8.36/orbitkit/VERSION +1 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +9 -4
- {orbitkit-0.8.35 → orbitkit-0.8.36/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.35/orbitkit/VERSION +0 -1
- {orbitkit-0.8.35 → orbitkit-0.8.36}/LICENSE +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/MANIFEST.in +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/README.md +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/setup.cfg +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.36}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.36
|
|
@@ -197,6 +197,7 @@ class PdfExtractorNetmindMerge:
|
|
|
197
197
|
temp_folder: Optional[str] = None,
|
|
198
198
|
s3_util: Optional[S3Util] = None,
|
|
199
199
|
txt_vector: str = 'txt-vector',
|
|
200
|
+
is_page_number_discontinuity_exception_thrown: bool = False, #页码不连续异常抛出
|
|
200
201
|
slice_option: Optional[SplitPageOptions] = SplitPageOptions(),
|
|
201
202
|
**kwargs):
|
|
202
203
|
self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
|
|
@@ -205,6 +206,7 @@ class PdfExtractorNetmindMerge:
|
|
|
205
206
|
self.txt_vector = txt_vector
|
|
206
207
|
self.slice_option = slice_option
|
|
207
208
|
self.source_s3_path = source_s3_path
|
|
209
|
+
self.is_page_number_discontinuity_exception_thrown = is_page_number_discontinuity_exception_thrown
|
|
208
210
|
if not source_s3_path:
|
|
209
211
|
raise Exception('not params source source_s3_path')
|
|
210
212
|
if s3_util:
|
|
@@ -266,10 +268,13 @@ class PdfExtractorNetmindMerge:
|
|
|
266
268
|
for i in range(1, len(all_pages)):
|
|
267
269
|
if all_pages[i] != all_pages[i - 1] + 1:
|
|
268
270
|
missing_pages = list(range(all_pages[i - 1] + 1, all_pages[i]))
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
271
|
+
if self.is_page_number_discontinuity_exception_thrown:
|
|
272
|
+
raise ValueError(
|
|
273
|
+
f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,"
|
|
274
|
+
f"缺少页码: {missing_pages}"
|
|
275
|
+
)
|
|
276
|
+
else:
|
|
277
|
+
print(f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,缺少页码: {missing_pages}")
|
|
273
278
|
logger.info("[JSON] Merge json result successfully...")
|
|
274
279
|
return sorted_file_arr
|
|
275
280
|
|
orbitkit-0.8.35/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.35
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|