orbitkit 0.8.35__tar.gz → 0.8.36__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. {orbitkit-0.8.35/orbitkit.egg-info → orbitkit-0.8.36}/PKG-INFO +1 -1
  2. orbitkit-0.8.36/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +9 -4
  4. {orbitkit-0.8.35 → orbitkit-0.8.36/orbitkit.egg-info}/PKG-INFO +1 -1
  5. orbitkit-0.8.35/orbitkit/VERSION +0 -1
  6. {orbitkit-0.8.35 → orbitkit-0.8.36}/LICENSE +0 -0
  7. {orbitkit-0.8.35 → orbitkit-0.8.36}/MANIFEST.in +0 -0
  8. {orbitkit-0.8.35 → orbitkit-0.8.36}/README.md +0 -0
  9. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/__init__.py +0 -0
  10. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/__init__.py +0 -0
  11. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  12. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  13. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  14. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  16. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/audio_transcoder/__init__.py +0 -0
  17. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  18. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/constant/__init__.py +0 -0
  19. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/constant/report_schema.py +0 -0
  20. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/id_srv/__init__.py +0 -0
  21. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/id_srv/id_gen.py +0 -0
  22. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/id_srv/id_perm_like.py +0 -0
  23. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/lark_send/__init__.py +0 -0
  24. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/lark_send/lark.py +0 -0
  25. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/llm_tools/__init__.py +0 -0
  26. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  27. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/orbit_type/__init__.py +0 -0
  28. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  29. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  30. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/orbit_type/tools.py +0 -0
  31. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_embedding/__init__.py +0 -0
  32. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  33. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  34. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/__init__.py +0 -0
  35. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  36. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/exceptions.py +0 -0
  37. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  38. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  39. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  40. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  41. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  42. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  43. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  44. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  45. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  46. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/base.py +0 -0
  47. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  48. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/core.py +0 -0
  49. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  50. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  51. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  52. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_writer/__init__.py +0 -0
  53. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  54. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/__init__.py +0 -0
  55. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/common.py +0 -0
  56. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/customize_regix_manager.py +0 -0
  57. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/secret_manager.py +0 -0
  58. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_aliyun.py +0 -0
  59. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  60. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_aws.py +0 -0
  61. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  62. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_date.py +0 -0
  63. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_html.py +0 -0
  64. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_kafka.py +0 -0
  65. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_md5.py +0 -0
  66. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_selenium.py +0 -0
  67. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_simple_timer.py +0 -0
  68. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_str.py +0 -0
  69. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_type_mapping.py +0 -0
  70. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit/util/util_url.py +0 -0
  71. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit.egg-info/SOURCES.txt +0 -0
  72. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit.egg-info/dependency_links.txt +0 -0
  73. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit.egg-info/not-zip-safe +0 -0
  74. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit.egg-info/requires.txt +0 -0
  75. {orbitkit-0.8.35 → orbitkit-0.8.36}/orbitkit.egg-info/top_level.txt +0 -0
  76. {orbitkit-0.8.35 → orbitkit-0.8.36}/setup.cfg +0 -0
  77. {orbitkit-0.8.35 → orbitkit-0.8.36}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.35
3
+ Version: 0.8.36
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.36
@@ -197,6 +197,7 @@ class PdfExtractorNetmindMerge:
197
197
  temp_folder: Optional[str] = None,
198
198
  s3_util: Optional[S3Util] = None,
199
199
  txt_vector: str = 'txt-vector',
200
+ is_page_number_discontinuity_exception_thrown: bool = False, #页码不连续异常抛出
200
201
  slice_option: Optional[SplitPageOptions] = SplitPageOptions(),
201
202
  **kwargs):
202
203
  self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
@@ -205,6 +206,7 @@ class PdfExtractorNetmindMerge:
205
206
  self.txt_vector = txt_vector
206
207
  self.slice_option = slice_option
207
208
  self.source_s3_path = source_s3_path
209
+ self.is_page_number_discontinuity_exception_thrown = is_page_number_discontinuity_exception_thrown
208
210
  if not source_s3_path:
209
211
  raise Exception('not params source source_s3_path')
210
212
  if s3_util:
@@ -266,10 +268,13 @@ class PdfExtractorNetmindMerge:
266
268
  for i in range(1, len(all_pages)):
267
269
  if all_pages[i] != all_pages[i - 1] + 1:
268
270
  missing_pages = list(range(all_pages[i - 1] + 1, all_pages[i]))
269
- raise ValueError(
270
- f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,"
271
- f"缺少页码: {missing_pages}"
272
- )
271
+ if self.is_page_number_discontinuity_exception_thrown:
272
+ raise ValueError(
273
+ f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,"
274
+ f"缺少页码: {missing_pages}"
275
+ )
276
+ else:
277
+ print(f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,缺少页码: {missing_pages}")
273
278
  logger.info("[JSON] Merge json result successfully...")
274
279
  return sorted_file_arr
275
280
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.35
3
+ Version: 0.8.36
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.35
File without changes
File without changes
File without changes
File without changes
File without changes