orbitkit 0.8.33__tar.gz → 0.8.35__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. {orbitkit-0.8.33/orbitkit.egg-info → orbitkit-0.8.35}/PKG-INFO +1 -1
  2. orbitkit-0.8.35/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +18 -21
  4. {orbitkit-0.8.33 → orbitkit-0.8.35/orbitkit.egg-info}/PKG-INFO +1 -1
  5. orbitkit-0.8.33/orbitkit/VERSION +0 -1
  6. {orbitkit-0.8.33 → orbitkit-0.8.35}/LICENSE +0 -0
  7. {orbitkit-0.8.33 → orbitkit-0.8.35}/MANIFEST.in +0 -0
  8. {orbitkit-0.8.33 → orbitkit-0.8.35}/README.md +0 -0
  9. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/__init__.py +0 -0
  10. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/__init__.py +0 -0
  11. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  12. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  13. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  14. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  16. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/audio_transcoder/__init__.py +0 -0
  17. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  18. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/constant/__init__.py +0 -0
  19. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/constant/report_schema.py +0 -0
  20. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/id_srv/__init__.py +0 -0
  21. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/id_srv/id_gen.py +0 -0
  22. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/id_srv/id_perm_like.py +0 -0
  23. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/lark_send/__init__.py +0 -0
  24. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/lark_send/lark.py +0 -0
  25. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/llm_tools/__init__.py +0 -0
  26. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  27. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/orbit_type/__init__.py +0 -0
  28. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  29. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  30. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/orbit_type/tools.py +0 -0
  31. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_embedding/__init__.py +0 -0
  32. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  33. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  34. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/__init__.py +0 -0
  35. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  36. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/exceptions.py +0 -0
  37. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  38. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  39. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  40. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  41. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  42. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  43. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  44. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  45. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  46. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/base.py +0 -0
  47. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  48. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/core.py +0 -0
  49. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  50. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  51. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  52. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_writer/__init__.py +0 -0
  53. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  54. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/__init__.py +0 -0
  55. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/common.py +0 -0
  56. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/customize_regix_manager.py +0 -0
  57. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/secret_manager.py +0 -0
  58. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_aliyun.py +0 -0
  59. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  60. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_aws.py +0 -0
  61. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  62. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_date.py +0 -0
  63. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_html.py +0 -0
  64. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_kafka.py +0 -0
  65. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_md5.py +0 -0
  66. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_selenium.py +0 -0
  67. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_simple_timer.py +0 -0
  68. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_str.py +0 -0
  69. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_type_mapping.py +0 -0
  70. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit/util/util_url.py +0 -0
  71. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit.egg-info/SOURCES.txt +0 -0
  72. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit.egg-info/dependency_links.txt +0 -0
  73. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit.egg-info/not-zip-safe +0 -0
  74. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit.egg-info/requires.txt +0 -0
  75. {orbitkit-0.8.33 → orbitkit-0.8.35}/orbitkit.egg-info/top_level.txt +0 -0
  76. {orbitkit-0.8.33 → orbitkit-0.8.35}/setup.cfg +0 -0
  77. {orbitkit-0.8.33 → orbitkit-0.8.35}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.33
3
+ Version: 0.8.35
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.35
@@ -13,11 +13,11 @@ from orbitkit.util import s3_split_path, S3Util, get_from_dict_or_env, ExtenCons
13
13
  get_content_type_4_filename
14
14
  from typing import Optional
15
15
  import urllib3
16
- from retry import retry
17
16
  import fitz # PyMuPDF
18
17
  import os
19
18
  from tqdm import tqdm
20
19
  from concurrent.futures import ThreadPoolExecutor, as_completed
20
+
21
21
  logger = logging.getLogger(__name__)
22
22
 
23
23
 
@@ -54,8 +54,8 @@ class PdfExtractorNetmindFileAnalysis:
54
54
  aws_access_key_id=self.aws_access_key_id,
55
55
  aws_secret_access_key=self.aws_secret_access_key)
56
56
  self.s3_resource = boto3.resource('s3',
57
- aws_access_key_id=self.aws_access_key_id,
58
- aws_secret_access_key=self.aws_secret_access_key)
57
+ aws_access_key_id=self.aws_access_key_id,
58
+ aws_secret_access_key=self.aws_secret_access_key)
59
59
 
60
60
  def upload_file_to_s3(self, local_key: str, remote_key: str):
61
61
  _remote_key = f'{self.bucket_tmp_group}{remote_key}'
@@ -147,18 +147,17 @@ class PdfExtractorNetmindExtract:
147
147
  s3_path_obj = s3_split_path(self.s3_path)
148
148
  presigned_url = self._generate_presigned_url(s3_path_obj)
149
149
  logger.warning("Get presigned_url successfully...")
150
- json_response = self.get_netmind_response(presigned_url)
151
- return self._save_json_to_s3(json_response, s3_path_obj)
150
+ api_response_time, json_response = self.get_netmind_response(presigned_url)
151
+ return api_response_time, self._save_json_to_s3(json_response, s3_path_obj)
152
152
 
153
153
  def _generate_presigned_url(self, s3_path_obj):
154
154
  return self.s3_client.generate_presigned_url('get_object',
155
- Params={
156
- 'Bucket': s3_path_obj["bucket"],
157
- 'Key': s3_path_obj["store_path"]
158
- },
159
- ExpiresIn=604800)
155
+ Params={
156
+ 'Bucket': s3_path_obj["bucket"],
157
+ 'Key': s3_path_obj["store_path"]
158
+ },
159
+ ExpiresIn=604800)
160
160
 
161
- @retry(tries=2, delay=4)
162
161
  def get_netmind_response(self, presigned_url):
163
162
  start = time.time()
164
163
  files = {"url": (None, presigned_url)}
@@ -167,8 +166,9 @@ class PdfExtractorNetmindExtract:
167
166
  )
168
167
  # 状态检查
169
168
  response.raise_for_status()
170
- logger.info(f"Extract text by using Netmind successfully: {time.time() - start}")
171
- return response.json()
169
+ api_response_time = time.time() - start
170
+ logger.info(f"Extract text by using Netmind successfully: {api_response_time}")
171
+ return api_response_time, response.json()
172
172
 
173
173
  def _save_json_to_s3(self, json_data, s3_path_obj):
174
174
  json_key = f"{s3_path_obj['store_path']}.json" # 生成 JSON 文件名
@@ -178,12 +178,12 @@ class PdfExtractorNetmindExtract:
178
178
  input_folder = os.path.join(tmp_dir, 'input')
179
179
  if not os.path.exists(input_folder):
180
180
  os.makedirs(input_folder)
181
- local_key = os.path.join(input_folder, local_name) # 临时文件路径
181
+ local_key = os.path.join(input_folder, local_name) # 临时文件路径
182
182
  # 将 JSON 数据写入临时文件
183
183
  with open(local_key, 'w') as json_file:
184
184
  json_file.write(json_content)
185
185
  # 上传到 S3
186
- self.upload_file_to_s3(s3_path_obj['bucket'],local_key, json_key)
186
+ self.upload_file_to_s3(s3_path_obj['bucket'], local_key, json_key)
187
187
  return f"s3://{s3_path_obj['bucket']}/{json_key}"
188
188
 
189
189
  def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
@@ -191,7 +191,6 @@ class PdfExtractorNetmindExtract:
191
191
  logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
192
192
 
193
193
 
194
-
195
194
  class PdfExtractorNetmindMerge:
196
195
  def __init__(self,
197
196
  source_s3_path: str = None,
@@ -223,17 +222,14 @@ class PdfExtractorNetmindMerge:
223
222
  self._s3_resource = self.s3_util.get_s3_resource()
224
223
  self._s3_client = self.s3_util.get_s3_client()
225
224
 
226
-
227
225
  def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
228
226
  self._s3_resource.Object(bucket, remote_key).upload_file(local_key)
229
227
  logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
230
228
 
231
-
232
229
  def download_file_from_s3(self, bucket: str, remote_key: str, local_key: str):
233
230
  self._s3_resource.Bucket(bucket).download_file(remote_key, local_key)
234
231
  logger.info(f"File s3://{bucket}/{remote_key} downloaded to {local_key}")
235
232
 
236
-
237
233
  def megre_json(self, json_s3_path_list):
238
234
  with tempfile.TemporaryDirectory() as tmp_dir:
239
235
  # page 字典
@@ -248,7 +244,8 @@ class PdfExtractorNetmindMerge:
248
244
  with open(local_path, 'r') as file:
249
245
  _split_response_json = json.load(file)
250
246
  file_item_name = _s3_path.split('/')[-1].replace('.json', '')
251
- start_page = (int(file_item_name.split('_')[-1].split('.')[0]) - 1) * self.slice_option.split_page_number
247
+ start_page = (int(
248
+ file_item_name.split('_')[-1].split('.')[0]) - 1) * self.slice_option.split_page_number
252
249
  meta_data[start_page] = _split_response_json
253
250
  logger.warning("[JSON] Down json result successfully...")
254
251
  # 合并
@@ -441,4 +438,4 @@ class PdfExtractorNetmindMerge:
441
438
 
442
439
  if block_raw["type"] == "image":
443
440
  f_block["image_detail"] = block_raw["image_detail"]
444
- return f_block
441
+ return f_block
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.33
3
+ Version: 0.8.35
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.33
File without changes
File without changes
File without changes
File without changes
File without changes