orbitkit 0.8.34__tar.gz → 0.8.36__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. {orbitkit-0.8.34/orbitkit.egg-info → orbitkit-0.8.36}/PKG-INFO +1 -1
  2. orbitkit-0.8.36/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +27 -23
  4. {orbitkit-0.8.34 → orbitkit-0.8.36/orbitkit.egg-info}/PKG-INFO +1 -1
  5. orbitkit-0.8.34/orbitkit/VERSION +0 -1
  6. {orbitkit-0.8.34 → orbitkit-0.8.36}/LICENSE +0 -0
  7. {orbitkit-0.8.34 → orbitkit-0.8.36}/MANIFEST.in +0 -0
  8. {orbitkit-0.8.34 → orbitkit-0.8.36}/README.md +0 -0
  9. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/__init__.py +0 -0
  10. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/__init__.py +0 -0
  11. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  12. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  13. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  14. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  16. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/audio_transcoder/__init__.py +0 -0
  17. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  18. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/constant/__init__.py +0 -0
  19. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/constant/report_schema.py +0 -0
  20. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/id_srv/__init__.py +0 -0
  21. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/id_srv/id_gen.py +0 -0
  22. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/id_srv/id_perm_like.py +0 -0
  23. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/lark_send/__init__.py +0 -0
  24. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/lark_send/lark.py +0 -0
  25. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/llm_tools/__init__.py +0 -0
  26. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  27. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/orbit_type/__init__.py +0 -0
  28. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  29. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  30. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/orbit_type/tools.py +0 -0
  31. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_embedding/__init__.py +0 -0
  32. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  33. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  34. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/__init__.py +0 -0
  35. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  36. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/exceptions.py +0 -0
  37. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  38. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  39. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  40. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  41. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  42. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  43. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  44. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  45. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  46. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/base.py +0 -0
  47. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  48. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/core.py +0 -0
  49. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  50. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  51. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  52. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_writer/__init__.py +0 -0
  53. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  54. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/__init__.py +0 -0
  55. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/common.py +0 -0
  56. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/customize_regix_manager.py +0 -0
  57. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/secret_manager.py +0 -0
  58. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_aliyun.py +0 -0
  59. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  60. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_aws.py +0 -0
  61. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  62. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_date.py +0 -0
  63. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_html.py +0 -0
  64. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_kafka.py +0 -0
  65. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_md5.py +0 -0
  66. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_selenium.py +0 -0
  67. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_simple_timer.py +0 -0
  68. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_str.py +0 -0
  69. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_type_mapping.py +0 -0
  70. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit/util/util_url.py +0 -0
  71. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit.egg-info/SOURCES.txt +0 -0
  72. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit.egg-info/dependency_links.txt +0 -0
  73. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit.egg-info/not-zip-safe +0 -0
  74. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit.egg-info/requires.txt +0 -0
  75. {orbitkit-0.8.34 → orbitkit-0.8.36}/orbitkit.egg-info/top_level.txt +0 -0
  76. {orbitkit-0.8.34 → orbitkit-0.8.36}/setup.cfg +0 -0
  77. {orbitkit-0.8.34 → orbitkit-0.8.36}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.34
3
+ Version: 0.8.36
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.36
@@ -17,6 +17,7 @@ import fitz # PyMuPDF
17
17
  import os
18
18
  from tqdm import tqdm
19
19
  from concurrent.futures import ThreadPoolExecutor, as_completed
20
+
20
21
  logger = logging.getLogger(__name__)
21
22
 
22
23
 
@@ -53,8 +54,8 @@ class PdfExtractorNetmindFileAnalysis:
53
54
  aws_access_key_id=self.aws_access_key_id,
54
55
  aws_secret_access_key=self.aws_secret_access_key)
55
56
  self.s3_resource = boto3.resource('s3',
56
- aws_access_key_id=self.aws_access_key_id,
57
- aws_secret_access_key=self.aws_secret_access_key)
57
+ aws_access_key_id=self.aws_access_key_id,
58
+ aws_secret_access_key=self.aws_secret_access_key)
58
59
 
59
60
  def upload_file_to_s3(self, local_key: str, remote_key: str):
60
61
  _remote_key = f'{self.bucket_tmp_group}{remote_key}'
@@ -146,16 +147,16 @@ class PdfExtractorNetmindExtract:
146
147
  s3_path_obj = s3_split_path(self.s3_path)
147
148
  presigned_url = self._generate_presigned_url(s3_path_obj)
148
149
  logger.warning("Get presigned_url successfully...")
149
- json_response = self.get_netmind_response(presigned_url)
150
- return self._save_json_to_s3(json_response, s3_path_obj)
150
+ api_response_time, json_response = self.get_netmind_response(presigned_url)
151
+ return api_response_time, self._save_json_to_s3(json_response, s3_path_obj)
151
152
 
152
153
  def _generate_presigned_url(self, s3_path_obj):
153
154
  return self.s3_client.generate_presigned_url('get_object',
154
- Params={
155
- 'Bucket': s3_path_obj["bucket"],
156
- 'Key': s3_path_obj["store_path"]
157
- },
158
- ExpiresIn=604800)
155
+ Params={
156
+ 'Bucket': s3_path_obj["bucket"],
157
+ 'Key': s3_path_obj["store_path"]
158
+ },
159
+ ExpiresIn=604800)
159
160
 
160
161
  def get_netmind_response(self, presigned_url):
161
162
  start = time.time()
@@ -165,8 +166,9 @@ class PdfExtractorNetmindExtract:
165
166
  )
166
167
  # 状态检查
167
168
  response.raise_for_status()
168
- logger.info(f"Extract text by using Netmind successfully: {time.time() - start}")
169
- return response.json()
169
+ api_response_time = time.time() - start
170
+ logger.info(f"Extract text by using Netmind successfully: {api_response_time}")
171
+ return api_response_time, response.json()
170
172
 
171
173
  def _save_json_to_s3(self, json_data, s3_path_obj):
172
174
  json_key = f"{s3_path_obj['store_path']}.json" # 生成 JSON 文件名
@@ -176,12 +178,12 @@ class PdfExtractorNetmindExtract:
176
178
  input_folder = os.path.join(tmp_dir, 'input')
177
179
  if not os.path.exists(input_folder):
178
180
  os.makedirs(input_folder)
179
- local_key = os.path.join(input_folder, local_name) # 临时文件路径
181
+ local_key = os.path.join(input_folder, local_name) # 临时文件路径
180
182
  # 将 JSON 数据写入临时文件
181
183
  with open(local_key, 'w') as json_file:
182
184
  json_file.write(json_content)
183
185
  # 上传到 S3
184
- self.upload_file_to_s3(s3_path_obj['bucket'],local_key, json_key)
186
+ self.upload_file_to_s3(s3_path_obj['bucket'], local_key, json_key)
185
187
  return f"s3://{s3_path_obj['bucket']}/{json_key}"
186
188
 
187
189
  def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
@@ -189,13 +191,13 @@ class PdfExtractorNetmindExtract:
189
191
  logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
190
192
 
191
193
 
192
-
193
194
  class PdfExtractorNetmindMerge:
194
195
  def __init__(self,
195
196
  source_s3_path: str = None,
196
197
  temp_folder: Optional[str] = None,
197
198
  s3_util: Optional[S3Util] = None,
198
199
  txt_vector: str = 'txt-vector',
200
+ is_page_number_discontinuity_exception_thrown: bool = False, #页码不连续异常抛出
199
201
  slice_option: Optional[SplitPageOptions] = SplitPageOptions(),
200
202
  **kwargs):
201
203
  self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
@@ -204,6 +206,7 @@ class PdfExtractorNetmindMerge:
204
206
  self.txt_vector = txt_vector
205
207
  self.slice_option = slice_option
206
208
  self.source_s3_path = source_s3_path
209
+ self.is_page_number_discontinuity_exception_thrown = is_page_number_discontinuity_exception_thrown
207
210
  if not source_s3_path:
208
211
  raise Exception('not params source source_s3_path')
209
212
  if s3_util:
@@ -221,17 +224,14 @@ class PdfExtractorNetmindMerge:
221
224
  self._s3_resource = self.s3_util.get_s3_resource()
222
225
  self._s3_client = self.s3_util.get_s3_client()
223
226
 
224
-
225
227
  def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
226
228
  self._s3_resource.Object(bucket, remote_key).upload_file(local_key)
227
229
  logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
228
230
 
229
-
230
231
  def download_file_from_s3(self, bucket: str, remote_key: str, local_key: str):
231
232
  self._s3_resource.Bucket(bucket).download_file(remote_key, local_key)
232
233
  logger.info(f"File s3://{bucket}/{remote_key} downloaded to {local_key}")
233
234
 
234
-
235
235
  def megre_json(self, json_s3_path_list):
236
236
  with tempfile.TemporaryDirectory() as tmp_dir:
237
237
  # page 字典
@@ -246,7 +246,8 @@ class PdfExtractorNetmindMerge:
246
246
  with open(local_path, 'r') as file:
247
247
  _split_response_json = json.load(file)
248
248
  file_item_name = _s3_path.split('/')[-1].replace('.json', '')
249
- start_page = (int(file_item_name.split('_')[-1].split('.')[0]) - 1) * self.slice_option.split_page_number
249
+ start_page = (int(
250
+ file_item_name.split('_')[-1].split('.')[0]) - 1) * self.slice_option.split_page_number
250
251
  meta_data[start_page] = _split_response_json
251
252
  logger.warning("[JSON] Down json result successfully...")
252
253
  # 合并
@@ -267,10 +268,13 @@ class PdfExtractorNetmindMerge:
267
268
  for i in range(1, len(all_pages)):
268
269
  if all_pages[i] != all_pages[i - 1] + 1:
269
270
  missing_pages = list(range(all_pages[i - 1] + 1, all_pages[i]))
270
- raise ValueError(
271
- f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,"
272
- f"缺少页码: {missing_pages}"
273
- )
271
+ if self.is_page_number_discontinuity_exception_thrown:
272
+ raise ValueError(
273
+ f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,"
274
+ f"缺少页码: {missing_pages}"
275
+ )
276
+ else:
277
+ print(f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,缺少页码: {missing_pages}")
274
278
  logger.info("[JSON] Merge json result successfully...")
275
279
  return sorted_file_arr
276
280
 
@@ -439,4 +443,4 @@ class PdfExtractorNetmindMerge:
439
443
 
440
444
  if block_raw["type"] == "image":
441
445
  f_block["image_detail"] = block_raw["image_detail"]
442
- return f_block
446
+ return f_block
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.34
3
+ Version: 0.8.36
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.34
File without changes
File without changes
File without changes
File without changes
File without changes