orbitkit 0.8.34__tar.gz → 0.8.35__tar.gz

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (77)
  1. {orbitkit-0.8.34/orbitkit.egg-info → orbitkit-0.8.35}/PKG-INFO +1 -1
  2. orbitkit-0.8.35/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +18 -19
  4. {orbitkit-0.8.34 → orbitkit-0.8.35/orbitkit.egg-info}/PKG-INFO +1 -1
  5. orbitkit-0.8.34/orbitkit/VERSION +0 -1
  6. {orbitkit-0.8.34 → orbitkit-0.8.35}/LICENSE +0 -0
  7. {orbitkit-0.8.34 → orbitkit-0.8.35}/MANIFEST.in +0 -0
  8. {orbitkit-0.8.34 → orbitkit-0.8.35}/README.md +0 -0
  9. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/__init__.py +0 -0
  10. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/__init__.py +0 -0
  11. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  12. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  13. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  14. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  16. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/audio_transcoder/__init__.py +0 -0
  17. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  18. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/constant/__init__.py +0 -0
  19. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/constant/report_schema.py +0 -0
  20. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/id_srv/__init__.py +0 -0
  21. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/id_srv/id_gen.py +0 -0
  22. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/id_srv/id_perm_like.py +0 -0
  23. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/lark_send/__init__.py +0 -0
  24. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/lark_send/lark.py +0 -0
  25. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/llm_tools/__init__.py +0 -0
  26. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  27. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/orbit_type/__init__.py +0 -0
  28. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  29. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  30. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/orbit_type/tools.py +0 -0
  31. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_embedding/__init__.py +0 -0
  32. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  33. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  34. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/__init__.py +0 -0
  35. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  36. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/exceptions.py +0 -0
  37. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  38. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  39. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  40. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  41. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  42. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  43. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  44. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  45. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  46. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/base.py +0 -0
  47. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  48. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/core.py +0 -0
  49. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  50. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  51. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  52. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_writer/__init__.py +0 -0
  53. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  54. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/__init__.py +0 -0
  55. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/common.py +0 -0
  56. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/customize_regix_manager.py +0 -0
  57. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/secret_manager.py +0 -0
  58. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_aliyun.py +0 -0
  59. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  60. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_aws.py +0 -0
  61. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  62. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_date.py +0 -0
  63. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_html.py +0 -0
  64. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_kafka.py +0 -0
  65. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_md5.py +0 -0
  66. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_selenium.py +0 -0
  67. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_simple_timer.py +0 -0
  68. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_str.py +0 -0
  69. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_type_mapping.py +0 -0
  70. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit/util/util_url.py +0 -0
  71. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit.egg-info/SOURCES.txt +0 -0
  72. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit.egg-info/dependency_links.txt +0 -0
  73. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit.egg-info/not-zip-safe +0 -0
  74. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit.egg-info/requires.txt +0 -0
  75. {orbitkit-0.8.34 → orbitkit-0.8.35}/orbitkit.egg-info/top_level.txt +0 -0
  76. {orbitkit-0.8.34 → orbitkit-0.8.35}/setup.cfg +0 -0
  77. {orbitkit-0.8.34 → orbitkit-0.8.35}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.34
3
+ Version: 0.8.35
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.35
@@ -17,6 +17,7 @@ import fitz # PyMuPDF
17
17
  import os
18
18
  from tqdm import tqdm
19
19
  from concurrent.futures import ThreadPoolExecutor, as_completed
20
+
20
21
  logger = logging.getLogger(__name__)
21
22
 
22
23
 
@@ -53,8 +54,8 @@ class PdfExtractorNetmindFileAnalysis:
53
54
  aws_access_key_id=self.aws_access_key_id,
54
55
  aws_secret_access_key=self.aws_secret_access_key)
55
56
  self.s3_resource = boto3.resource('s3',
56
- aws_access_key_id=self.aws_access_key_id,
57
- aws_secret_access_key=self.aws_secret_access_key)
57
+ aws_access_key_id=self.aws_access_key_id,
58
+ aws_secret_access_key=self.aws_secret_access_key)
58
59
 
59
60
  def upload_file_to_s3(self, local_key: str, remote_key: str):
60
61
  _remote_key = f'{self.bucket_tmp_group}{remote_key}'
@@ -146,16 +147,16 @@ class PdfExtractorNetmindExtract:
146
147
  s3_path_obj = s3_split_path(self.s3_path)
147
148
  presigned_url = self._generate_presigned_url(s3_path_obj)
148
149
  logger.warning("Get presigned_url successfully...")
149
- json_response = self.get_netmind_response(presigned_url)
150
- return self._save_json_to_s3(json_response, s3_path_obj)
150
+ api_response_time, json_response = self.get_netmind_response(presigned_url)
151
+ return api_response_time, self._save_json_to_s3(json_response, s3_path_obj)
151
152
 
152
153
  def _generate_presigned_url(self, s3_path_obj):
153
154
  return self.s3_client.generate_presigned_url('get_object',
154
- Params={
155
- 'Bucket': s3_path_obj["bucket"],
156
- 'Key': s3_path_obj["store_path"]
157
- },
158
- ExpiresIn=604800)
155
+ Params={
156
+ 'Bucket': s3_path_obj["bucket"],
157
+ 'Key': s3_path_obj["store_path"]
158
+ },
159
+ ExpiresIn=604800)
159
160
 
160
161
  def get_netmind_response(self, presigned_url):
161
162
  start = time.time()
@@ -165,8 +166,9 @@ class PdfExtractorNetmindExtract:
165
166
  )
166
167
  # 状态检查
167
168
  response.raise_for_status()
168
- logger.info(f"Extract text by using Netmind successfully: {time.time() - start}")
169
- return response.json()
169
+ api_response_time = time.time() - start
170
+ logger.info(f"Extract text by using Netmind successfully: {api_response_time}")
171
+ return api_response_time, response.json()
170
172
 
171
173
  def _save_json_to_s3(self, json_data, s3_path_obj):
172
174
  json_key = f"{s3_path_obj['store_path']}.json" # 生成 JSON 文件名
@@ -176,12 +178,12 @@ class PdfExtractorNetmindExtract:
176
178
  input_folder = os.path.join(tmp_dir, 'input')
177
179
  if not os.path.exists(input_folder):
178
180
  os.makedirs(input_folder)
179
- local_key = os.path.join(input_folder, local_name) # 临时文件路径
181
+ local_key = os.path.join(input_folder, local_name) # 临时文件路径
180
182
  # 将 JSON 数据写入临时文件
181
183
  with open(local_key, 'w') as json_file:
182
184
  json_file.write(json_content)
183
185
  # 上传到 S3
184
- self.upload_file_to_s3(s3_path_obj['bucket'],local_key, json_key)
186
+ self.upload_file_to_s3(s3_path_obj['bucket'], local_key, json_key)
185
187
  return f"s3://{s3_path_obj['bucket']}/{json_key}"
186
188
 
187
189
  def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
@@ -189,7 +191,6 @@ class PdfExtractorNetmindExtract:
189
191
  logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
190
192
 
191
193
 
192
-
193
194
  class PdfExtractorNetmindMerge:
194
195
  def __init__(self,
195
196
  source_s3_path: str = None,
@@ -221,17 +222,14 @@ class PdfExtractorNetmindMerge:
221
222
  self._s3_resource = self.s3_util.get_s3_resource()
222
223
  self._s3_client = self.s3_util.get_s3_client()
223
224
 
224
-
225
225
  def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
226
226
  self._s3_resource.Object(bucket, remote_key).upload_file(local_key)
227
227
  logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
228
228
 
229
-
230
229
  def download_file_from_s3(self, bucket: str, remote_key: str, local_key: str):
231
230
  self._s3_resource.Bucket(bucket).download_file(remote_key, local_key)
232
231
  logger.info(f"File s3://{bucket}/{remote_key} downloaded to {local_key}")
233
232
 
234
-
235
233
  def megre_json(self, json_s3_path_list):
236
234
  with tempfile.TemporaryDirectory() as tmp_dir:
237
235
  # page 字典
@@ -246,7 +244,8 @@ class PdfExtractorNetmindMerge:
246
244
  with open(local_path, 'r') as file:
247
245
  _split_response_json = json.load(file)
248
246
  file_item_name = _s3_path.split('/')[-1].replace('.json', '')
249
- start_page = (int(file_item_name.split('_')[-1].split('.')[0]) - 1) * self.slice_option.split_page_number
247
+ start_page = (int(
248
+ file_item_name.split('_')[-1].split('.')[0]) - 1) * self.slice_option.split_page_number
250
249
  meta_data[start_page] = _split_response_json
251
250
  logger.warning("[JSON] Down json result successfully...")
252
251
  # 合并
@@ -439,4 +438,4 @@ class PdfExtractorNetmindMerge:
439
438
 
440
439
  if block_raw["type"] == "image":
441
440
  f_block["image_detail"] = block_raw["image_detail"]
442
- return f_block
441
+ return f_block
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.34
3
+ Version: 0.8.35
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.34
File without changes
File without changes
File without changes
File without changes
File without changes