orbitkit 0.8.34__tar.gz → 0.8.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.34/orbitkit.egg-info → orbitkit-0.8.36}/PKG-INFO +1 -1
- orbitkit-0.8.36/orbitkit/VERSION +1 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +27 -23
- {orbitkit-0.8.34 → orbitkit-0.8.36/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.34/orbitkit/VERSION +0 -1
- {orbitkit-0.8.34 → orbitkit-0.8.36}/LICENSE +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/MANIFEST.in +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/README.md +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/setup.cfg +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.36}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.36
|
|
@@ -17,6 +17,7 @@ import fitz # PyMuPDF
|
|
|
17
17
|
import os
|
|
18
18
|
from tqdm import tqdm
|
|
19
19
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
20
|
+
|
|
20
21
|
logger = logging.getLogger(__name__)
|
|
21
22
|
|
|
22
23
|
|
|
@@ -53,8 +54,8 @@ class PdfExtractorNetmindFileAnalysis:
|
|
|
53
54
|
aws_access_key_id=self.aws_access_key_id,
|
|
54
55
|
aws_secret_access_key=self.aws_secret_access_key)
|
|
55
56
|
self.s3_resource = boto3.resource('s3',
|
|
56
|
-
|
|
57
|
-
|
|
57
|
+
aws_access_key_id=self.aws_access_key_id,
|
|
58
|
+
aws_secret_access_key=self.aws_secret_access_key)
|
|
58
59
|
|
|
59
60
|
def upload_file_to_s3(self, local_key: str, remote_key: str):
|
|
60
61
|
_remote_key = f'{self.bucket_tmp_group}{remote_key}'
|
|
@@ -146,16 +147,16 @@ class PdfExtractorNetmindExtract:
|
|
|
146
147
|
s3_path_obj = s3_split_path(self.s3_path)
|
|
147
148
|
presigned_url = self._generate_presigned_url(s3_path_obj)
|
|
148
149
|
logger.warning("Get presigned_url successfully...")
|
|
149
|
-
json_response = self.get_netmind_response(presigned_url)
|
|
150
|
-
return self._save_json_to_s3(json_response, s3_path_obj)
|
|
150
|
+
api_response_time, json_response = self.get_netmind_response(presigned_url)
|
|
151
|
+
return api_response_time, self._save_json_to_s3(json_response, s3_path_obj)
|
|
151
152
|
|
|
152
153
|
def _generate_presigned_url(self, s3_path_obj):
|
|
153
154
|
return self.s3_client.generate_presigned_url('get_object',
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
155
|
+
Params={
|
|
156
|
+
'Bucket': s3_path_obj["bucket"],
|
|
157
|
+
'Key': s3_path_obj["store_path"]
|
|
158
|
+
},
|
|
159
|
+
ExpiresIn=604800)
|
|
159
160
|
|
|
160
161
|
def get_netmind_response(self, presigned_url):
|
|
161
162
|
start = time.time()
|
|
@@ -165,8 +166,9 @@ class PdfExtractorNetmindExtract:
|
|
|
165
166
|
)
|
|
166
167
|
# 状态检查
|
|
167
168
|
response.raise_for_status()
|
|
168
|
-
|
|
169
|
-
|
|
169
|
+
api_response_time = time.time() - start
|
|
170
|
+
logger.info(f"Extract text by using Netmind successfully: {api_response_time}")
|
|
171
|
+
return api_response_time, response.json()
|
|
170
172
|
|
|
171
173
|
def _save_json_to_s3(self, json_data, s3_path_obj):
|
|
172
174
|
json_key = f"{s3_path_obj['store_path']}.json" # 生成 JSON 文件名
|
|
@@ -176,12 +178,12 @@ class PdfExtractorNetmindExtract:
|
|
|
176
178
|
input_folder = os.path.join(tmp_dir, 'input')
|
|
177
179
|
if not os.path.exists(input_folder):
|
|
178
180
|
os.makedirs(input_folder)
|
|
179
|
-
local_key = os.path.join(input_folder, local_name)
|
|
181
|
+
local_key = os.path.join(input_folder, local_name) # 临时文件路径
|
|
180
182
|
# 将 JSON 数据写入临时文件
|
|
181
183
|
with open(local_key, 'w') as json_file:
|
|
182
184
|
json_file.write(json_content)
|
|
183
185
|
# 上传到 S3
|
|
184
|
-
self.upload_file_to_s3(s3_path_obj['bucket'],local_key, json_key)
|
|
186
|
+
self.upload_file_to_s3(s3_path_obj['bucket'], local_key, json_key)
|
|
185
187
|
return f"s3://{s3_path_obj['bucket']}/{json_key}"
|
|
186
188
|
|
|
187
189
|
def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
|
|
@@ -189,13 +191,13 @@ class PdfExtractorNetmindExtract:
|
|
|
189
191
|
logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
|
|
190
192
|
|
|
191
193
|
|
|
192
|
-
|
|
193
194
|
class PdfExtractorNetmindMerge:
|
|
194
195
|
def __init__(self,
|
|
195
196
|
source_s3_path: str = None,
|
|
196
197
|
temp_folder: Optional[str] = None,
|
|
197
198
|
s3_util: Optional[S3Util] = None,
|
|
198
199
|
txt_vector: str = 'txt-vector',
|
|
200
|
+
is_page_number_discontinuity_exception_thrown: bool = False, #页码不连续异常抛出
|
|
199
201
|
slice_option: Optional[SplitPageOptions] = SplitPageOptions(),
|
|
200
202
|
**kwargs):
|
|
201
203
|
self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
|
|
@@ -204,6 +206,7 @@ class PdfExtractorNetmindMerge:
|
|
|
204
206
|
self.txt_vector = txt_vector
|
|
205
207
|
self.slice_option = slice_option
|
|
206
208
|
self.source_s3_path = source_s3_path
|
|
209
|
+
self.is_page_number_discontinuity_exception_thrown = is_page_number_discontinuity_exception_thrown
|
|
207
210
|
if not source_s3_path:
|
|
208
211
|
raise Exception('not params source source_s3_path')
|
|
209
212
|
if s3_util:
|
|
@@ -221,17 +224,14 @@ class PdfExtractorNetmindMerge:
|
|
|
221
224
|
self._s3_resource = self.s3_util.get_s3_resource()
|
|
222
225
|
self._s3_client = self.s3_util.get_s3_client()
|
|
223
226
|
|
|
224
|
-
|
|
225
227
|
def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
|
|
226
228
|
self._s3_resource.Object(bucket, remote_key).upload_file(local_key)
|
|
227
229
|
logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
|
|
228
230
|
|
|
229
|
-
|
|
230
231
|
def download_file_from_s3(self, bucket: str, remote_key: str, local_key: str):
|
|
231
232
|
self._s3_resource.Bucket(bucket).download_file(remote_key, local_key)
|
|
232
233
|
logger.info(f"File s3://{bucket}/{remote_key} downloaded to {local_key}")
|
|
233
234
|
|
|
234
|
-
|
|
235
235
|
def megre_json(self, json_s3_path_list):
|
|
236
236
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
237
237
|
# page 字典
|
|
@@ -246,7 +246,8 @@ class PdfExtractorNetmindMerge:
|
|
|
246
246
|
with open(local_path, 'r') as file:
|
|
247
247
|
_split_response_json = json.load(file)
|
|
248
248
|
file_item_name = _s3_path.split('/')[-1].replace('.json', '')
|
|
249
|
-
start_page = (int(
|
|
249
|
+
start_page = (int(
|
|
250
|
+
file_item_name.split('_')[-1].split('.')[0]) - 1) * self.slice_option.split_page_number
|
|
250
251
|
meta_data[start_page] = _split_response_json
|
|
251
252
|
logger.warning("[JSON] Down json result successfully...")
|
|
252
253
|
# 合并
|
|
@@ -267,10 +268,13 @@ class PdfExtractorNetmindMerge:
|
|
|
267
268
|
for i in range(1, len(all_pages)):
|
|
268
269
|
if all_pages[i] != all_pages[i - 1] + 1:
|
|
269
270
|
missing_pages = list(range(all_pages[i - 1] + 1, all_pages[i]))
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
271
|
+
if self.is_page_number_discontinuity_exception_thrown:
|
|
272
|
+
raise ValueError(
|
|
273
|
+
f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,"
|
|
274
|
+
f"缺少页码: {missing_pages}"
|
|
275
|
+
)
|
|
276
|
+
else:
|
|
277
|
+
print(f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,缺少页码: {missing_pages}")
|
|
274
278
|
logger.info("[JSON] Merge json result successfully...")
|
|
275
279
|
return sorted_file_arr
|
|
276
280
|
|
|
@@ -439,4 +443,4 @@ class PdfExtractorNetmindMerge:
|
|
|
439
443
|
|
|
440
444
|
if block_raw["type"] == "image":
|
|
441
445
|
f_block["image_detail"] = block_raw["image_detail"]
|
|
442
|
-
return f_block
|
|
446
|
+
return f_block
|
orbitkit-0.8.34/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.34
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|