orbitkit 0.8.34__tar.gz → 0.8.35__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.34/orbitkit.egg-info → orbitkit-0.8.35}/PKG-INFO +1 -1
- orbitkit-0.8.35/orbitkit/VERSION +1 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +18 -19
- {orbitkit-0.8.34 → orbitkit-0.8.35/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.34/orbitkit/VERSION +0 -1
- {orbitkit-0.8.34 → orbitkit-0.8.35}/LICENSE +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/MANIFEST.in +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/README.md +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/setup.cfg +0 -0
- {orbitkit-0.8.34 → orbitkit-0.8.35}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.35
|
|
@@ -17,6 +17,7 @@ import fitz # PyMuPDF
|
|
|
17
17
|
import os
|
|
18
18
|
from tqdm import tqdm
|
|
19
19
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
20
|
+
|
|
20
21
|
logger = logging.getLogger(__name__)
|
|
21
22
|
|
|
22
23
|
|
|
@@ -53,8 +54,8 @@ class PdfExtractorNetmindFileAnalysis:
|
|
|
53
54
|
aws_access_key_id=self.aws_access_key_id,
|
|
54
55
|
aws_secret_access_key=self.aws_secret_access_key)
|
|
55
56
|
self.s3_resource = boto3.resource('s3',
|
|
56
|
-
|
|
57
|
-
|
|
57
|
+
aws_access_key_id=self.aws_access_key_id,
|
|
58
|
+
aws_secret_access_key=self.aws_secret_access_key)
|
|
58
59
|
|
|
59
60
|
def upload_file_to_s3(self, local_key: str, remote_key: str):
|
|
60
61
|
_remote_key = f'{self.bucket_tmp_group}{remote_key}'
|
|
@@ -146,16 +147,16 @@ class PdfExtractorNetmindExtract:
|
|
|
146
147
|
s3_path_obj = s3_split_path(self.s3_path)
|
|
147
148
|
presigned_url = self._generate_presigned_url(s3_path_obj)
|
|
148
149
|
logger.warning("Get presigned_url successfully...")
|
|
149
|
-
json_response = self.get_netmind_response(presigned_url)
|
|
150
|
-
return self._save_json_to_s3(json_response, s3_path_obj)
|
|
150
|
+
api_response_time, json_response = self.get_netmind_response(presigned_url)
|
|
151
|
+
return api_response_time, self._save_json_to_s3(json_response, s3_path_obj)
|
|
151
152
|
|
|
152
153
|
def _generate_presigned_url(self, s3_path_obj):
|
|
153
154
|
return self.s3_client.generate_presigned_url('get_object',
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
155
|
+
Params={
|
|
156
|
+
'Bucket': s3_path_obj["bucket"],
|
|
157
|
+
'Key': s3_path_obj["store_path"]
|
|
158
|
+
},
|
|
159
|
+
ExpiresIn=604800)
|
|
159
160
|
|
|
160
161
|
def get_netmind_response(self, presigned_url):
|
|
161
162
|
start = time.time()
|
|
@@ -165,8 +166,9 @@ class PdfExtractorNetmindExtract:
|
|
|
165
166
|
)
|
|
166
167
|
# 状态检查
|
|
167
168
|
response.raise_for_status()
|
|
168
|
-
|
|
169
|
-
|
|
169
|
+
api_response_time = time.time() - start
|
|
170
|
+
logger.info(f"Extract text by using Netmind successfully: {api_response_time}")
|
|
171
|
+
return api_response_time, response.json()
|
|
170
172
|
|
|
171
173
|
def _save_json_to_s3(self, json_data, s3_path_obj):
|
|
172
174
|
json_key = f"{s3_path_obj['store_path']}.json" # 生成 JSON 文件名
|
|
@@ -176,12 +178,12 @@ class PdfExtractorNetmindExtract:
|
|
|
176
178
|
input_folder = os.path.join(tmp_dir, 'input')
|
|
177
179
|
if not os.path.exists(input_folder):
|
|
178
180
|
os.makedirs(input_folder)
|
|
179
|
-
local_key = os.path.join(input_folder, local_name)
|
|
181
|
+
local_key = os.path.join(input_folder, local_name) # 临时文件路径
|
|
180
182
|
# 将 JSON 数据写入临时文件
|
|
181
183
|
with open(local_key, 'w') as json_file:
|
|
182
184
|
json_file.write(json_content)
|
|
183
185
|
# 上传到 S3
|
|
184
|
-
self.upload_file_to_s3(s3_path_obj['bucket'],local_key, json_key)
|
|
186
|
+
self.upload_file_to_s3(s3_path_obj['bucket'], local_key, json_key)
|
|
185
187
|
return f"s3://{s3_path_obj['bucket']}/{json_key}"
|
|
186
188
|
|
|
187
189
|
def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
|
|
@@ -189,7 +191,6 @@ class PdfExtractorNetmindExtract:
|
|
|
189
191
|
logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
|
|
190
192
|
|
|
191
193
|
|
|
192
|
-
|
|
193
194
|
class PdfExtractorNetmindMerge:
|
|
194
195
|
def __init__(self,
|
|
195
196
|
source_s3_path: str = None,
|
|
@@ -221,17 +222,14 @@ class PdfExtractorNetmindMerge:
|
|
|
221
222
|
self._s3_resource = self.s3_util.get_s3_resource()
|
|
222
223
|
self._s3_client = self.s3_util.get_s3_client()
|
|
223
224
|
|
|
224
|
-
|
|
225
225
|
def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
|
|
226
226
|
self._s3_resource.Object(bucket, remote_key).upload_file(local_key)
|
|
227
227
|
logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
|
|
228
228
|
|
|
229
|
-
|
|
230
229
|
def download_file_from_s3(self, bucket: str, remote_key: str, local_key: str):
|
|
231
230
|
self._s3_resource.Bucket(bucket).download_file(remote_key, local_key)
|
|
232
231
|
logger.info(f"File s3://{bucket}/{remote_key} downloaded to {local_key}")
|
|
233
232
|
|
|
234
|
-
|
|
235
233
|
def megre_json(self, json_s3_path_list):
|
|
236
234
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
237
235
|
# page 字典
|
|
@@ -246,7 +244,8 @@ class PdfExtractorNetmindMerge:
|
|
|
246
244
|
with open(local_path, 'r') as file:
|
|
247
245
|
_split_response_json = json.load(file)
|
|
248
246
|
file_item_name = _s3_path.split('/')[-1].replace('.json', '')
|
|
249
|
-
start_page = (int(
|
|
247
|
+
start_page = (int(
|
|
248
|
+
file_item_name.split('_')[-1].split('.')[0]) - 1) * self.slice_option.split_page_number
|
|
250
249
|
meta_data[start_page] = _split_response_json
|
|
251
250
|
logger.warning("[JSON] Down json result successfully...")
|
|
252
251
|
# 合并
|
|
@@ -439,4 +438,4 @@ class PdfExtractorNetmindMerge:
|
|
|
439
438
|
|
|
440
439
|
if block_raw["type"] == "image":
|
|
441
440
|
f_block["image_detail"] = block_raw["image_detail"]
|
|
442
|
-
return f_block
|
|
441
|
+
return f_block
|
orbitkit-0.8.34/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.34
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|