orbitkit 0.8.33__tar.gz → 0.8.35__tar.gz
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {orbitkit-0.8.33/orbitkit.egg-info → orbitkit-0.8.35}/PKG-INFO +1 -1
- orbitkit-0.8.35/orbitkit/VERSION +1 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +18 -21
- {orbitkit-0.8.33 → orbitkit-0.8.35/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.33/orbitkit/VERSION +0 -1
- {orbitkit-0.8.33 → orbitkit-0.8.35}/LICENSE +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/MANIFEST.in +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/README.md +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/setup.cfg +0 -0
- {orbitkit-0.8.33 → orbitkit-0.8.35}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.35
|
|
@@ -13,11 +13,11 @@ from orbitkit.util import s3_split_path, S3Util, get_from_dict_or_env, ExtenCons
|
|
|
13
13
|
get_content_type_4_filename
|
|
14
14
|
from typing import Optional
|
|
15
15
|
import urllib3
|
|
16
|
-
from retry import retry
|
|
17
16
|
import fitz # PyMuPDF
|
|
18
17
|
import os
|
|
19
18
|
from tqdm import tqdm
|
|
20
19
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
20
|
+
|
|
21
21
|
logger = logging.getLogger(__name__)
|
|
22
22
|
|
|
23
23
|
|
|
@@ -54,8 +54,8 @@ class PdfExtractorNetmindFileAnalysis:
|
|
|
54
54
|
aws_access_key_id=self.aws_access_key_id,
|
|
55
55
|
aws_secret_access_key=self.aws_secret_access_key)
|
|
56
56
|
self.s3_resource = boto3.resource('s3',
|
|
57
|
-
|
|
58
|
-
|
|
57
|
+
aws_access_key_id=self.aws_access_key_id,
|
|
58
|
+
aws_secret_access_key=self.aws_secret_access_key)
|
|
59
59
|
|
|
60
60
|
def upload_file_to_s3(self, local_key: str, remote_key: str):
|
|
61
61
|
_remote_key = f'{self.bucket_tmp_group}{remote_key}'
|
|
@@ -147,18 +147,17 @@ class PdfExtractorNetmindExtract:
|
|
|
147
147
|
s3_path_obj = s3_split_path(self.s3_path)
|
|
148
148
|
presigned_url = self._generate_presigned_url(s3_path_obj)
|
|
149
149
|
logger.warning("Get presigned_url successfully...")
|
|
150
|
-
json_response = self.get_netmind_response(presigned_url)
|
|
151
|
-
return self._save_json_to_s3(json_response, s3_path_obj)
|
|
150
|
+
api_response_time, json_response = self.get_netmind_response(presigned_url)
|
|
151
|
+
return api_response_time, self._save_json_to_s3(json_response, s3_path_obj)
|
|
152
152
|
|
|
153
153
|
def _generate_presigned_url(self, s3_path_obj):
|
|
154
154
|
return self.s3_client.generate_presigned_url('get_object',
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
155
|
+
Params={
|
|
156
|
+
'Bucket': s3_path_obj["bucket"],
|
|
157
|
+
'Key': s3_path_obj["store_path"]
|
|
158
|
+
},
|
|
159
|
+
ExpiresIn=604800)
|
|
160
160
|
|
|
161
|
-
@retry(tries=2, delay=4)
|
|
162
161
|
def get_netmind_response(self, presigned_url):
|
|
163
162
|
start = time.time()
|
|
164
163
|
files = {"url": (None, presigned_url)}
|
|
@@ -167,8 +166,9 @@ class PdfExtractorNetmindExtract:
|
|
|
167
166
|
)
|
|
168
167
|
# 状态检查
|
|
169
168
|
response.raise_for_status()
|
|
170
|
-
|
|
171
|
-
|
|
169
|
+
api_response_time = time.time() - start
|
|
170
|
+
logger.info(f"Extract text by using Netmind successfully: {api_response_time}")
|
|
171
|
+
return api_response_time, response.json()
|
|
172
172
|
|
|
173
173
|
def _save_json_to_s3(self, json_data, s3_path_obj):
|
|
174
174
|
json_key = f"{s3_path_obj['store_path']}.json" # 生成 JSON 文件名
|
|
@@ -178,12 +178,12 @@ class PdfExtractorNetmindExtract:
|
|
|
178
178
|
input_folder = os.path.join(tmp_dir, 'input')
|
|
179
179
|
if not os.path.exists(input_folder):
|
|
180
180
|
os.makedirs(input_folder)
|
|
181
|
-
local_key = os.path.join(input_folder, local_name)
|
|
181
|
+
local_key = os.path.join(input_folder, local_name) # 临时文件路径
|
|
182
182
|
# 将 JSON 数据写入临时文件
|
|
183
183
|
with open(local_key, 'w') as json_file:
|
|
184
184
|
json_file.write(json_content)
|
|
185
185
|
# 上传到 S3
|
|
186
|
-
self.upload_file_to_s3(s3_path_obj['bucket'],local_key, json_key)
|
|
186
|
+
self.upload_file_to_s3(s3_path_obj['bucket'], local_key, json_key)
|
|
187
187
|
return f"s3://{s3_path_obj['bucket']}/{json_key}"
|
|
188
188
|
|
|
189
189
|
def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
|
|
@@ -191,7 +191,6 @@ class PdfExtractorNetmindExtract:
|
|
|
191
191
|
logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
|
|
192
192
|
|
|
193
193
|
|
|
194
|
-
|
|
195
194
|
class PdfExtractorNetmindMerge:
|
|
196
195
|
def __init__(self,
|
|
197
196
|
source_s3_path: str = None,
|
|
@@ -223,17 +222,14 @@ class PdfExtractorNetmindMerge:
|
|
|
223
222
|
self._s3_resource = self.s3_util.get_s3_resource()
|
|
224
223
|
self._s3_client = self.s3_util.get_s3_client()
|
|
225
224
|
|
|
226
|
-
|
|
227
225
|
def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
|
|
228
226
|
self._s3_resource.Object(bucket, remote_key).upload_file(local_key)
|
|
229
227
|
logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
|
|
230
228
|
|
|
231
|
-
|
|
232
229
|
def download_file_from_s3(self, bucket: str, remote_key: str, local_key: str):
|
|
233
230
|
self._s3_resource.Bucket(bucket).download_file(remote_key, local_key)
|
|
234
231
|
logger.info(f"File s3://{bucket}/{remote_key} downloaded to {local_key}")
|
|
235
232
|
|
|
236
|
-
|
|
237
233
|
def megre_json(self, json_s3_path_list):
|
|
238
234
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
239
235
|
# page 字典
|
|
@@ -248,7 +244,8 @@ class PdfExtractorNetmindMerge:
|
|
|
248
244
|
with open(local_path, 'r') as file:
|
|
249
245
|
_split_response_json = json.load(file)
|
|
250
246
|
file_item_name = _s3_path.split('/')[-1].replace('.json', '')
|
|
251
|
-
start_page = (int(
|
|
247
|
+
start_page = (int(
|
|
248
|
+
file_item_name.split('_')[-1].split('.')[0]) - 1) * self.slice_option.split_page_number
|
|
252
249
|
meta_data[start_page] = _split_response_json
|
|
253
250
|
logger.warning("[JSON] Down json result successfully...")
|
|
254
251
|
# 合并
|
|
@@ -441,4 +438,4 @@ class PdfExtractorNetmindMerge:
|
|
|
441
438
|
|
|
442
439
|
if block_raw["type"] == "image":
|
|
443
440
|
f_block["image_detail"] = block_raw["image_detail"]
|
|
444
|
-
return f_block
|
|
441
|
+
return f_block
|
orbitkit-0.8.33/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.33
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|