orbitkit 0.8.32__tar.gz → 0.8.34__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. {orbitkit-0.8.32/orbitkit.egg-info → orbitkit-0.8.34}/PKG-INFO +1 -1
  2. orbitkit-0.8.34/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +6 -2
  4. {orbitkit-0.8.32 → orbitkit-0.8.34/orbitkit.egg-info}/PKG-INFO +1 -1
  5. orbitkit-0.8.32/orbitkit/VERSION +0 -1
  6. {orbitkit-0.8.32 → orbitkit-0.8.34}/LICENSE +0 -0
  7. {orbitkit-0.8.32 → orbitkit-0.8.34}/MANIFEST.in +0 -0
  8. {orbitkit-0.8.32 → orbitkit-0.8.34}/README.md +0 -0
  9. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/__init__.py +0 -0
  10. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/airflow_handler/__init__.py +0 -0
  11. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  12. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  13. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  14. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  16. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/audio_transcoder/__init__.py +0 -0
  17. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  18. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/constant/__init__.py +0 -0
  19. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/constant/report_schema.py +0 -0
  20. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/id_srv/__init__.py +0 -0
  21. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/id_srv/id_gen.py +0 -0
  22. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/id_srv/id_perm_like.py +0 -0
  23. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/lark_send/__init__.py +0 -0
  24. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/lark_send/lark.py +0 -0
  25. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/llm_tools/__init__.py +0 -0
  26. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  27. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/orbit_type/__init__.py +0 -0
  28. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  29. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  30. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/orbit_type/tools.py +0 -0
  31. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_embedding/__init__.py +0 -0
  32. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  33. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  34. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/__init__.py +0 -0
  35. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  36. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/exceptions.py +0 -0
  37. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  38. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  39. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  40. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  41. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  42. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  43. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  44. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  45. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  46. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor_simple/base.py +0 -0
  47. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  48. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor_simple/core.py +0 -0
  49. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  50. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  51. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  52. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_writer/__init__.py +0 -0
  53. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  54. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/__init__.py +0 -0
  55. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/common.py +0 -0
  56. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/customize_regix_manager.py +0 -0
  57. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/secret_manager.py +0 -0
  58. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_aliyun.py +0 -0
  59. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  60. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_aws.py +0 -0
  61. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  62. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_date.py +0 -0
  63. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_html.py +0 -0
  64. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_kafka.py +0 -0
  65. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_md5.py +0 -0
  66. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_selenium.py +0 -0
  67. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_simple_timer.py +0 -0
  68. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_str.py +0 -0
  69. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_type_mapping.py +0 -0
  70. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit/util/util_url.py +0 -0
  71. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit.egg-info/SOURCES.txt +0 -0
  72. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit.egg-info/dependency_links.txt +0 -0
  73. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit.egg-info/not-zip-safe +0 -0
  74. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit.egg-info/requires.txt +0 -0
  75. {orbitkit-0.8.32 → orbitkit-0.8.34}/orbitkit.egg-info/top_level.txt +0 -0
  76. {orbitkit-0.8.32 → orbitkit-0.8.34}/setup.cfg +0 -0
  77. {orbitkit-0.8.32 → orbitkit-0.8.34}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.32
3
+ Version: 0.8.34
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.34
@@ -13,7 +13,6 @@ from orbitkit.util import s3_split_path, S3Util, get_from_dict_or_env, ExtenCons
13
13
  get_content_type_4_filename
14
14
  from typing import Optional
15
15
  import urllib3
16
- from retry import retry
17
16
  import fitz # PyMuPDF
18
17
  import os
19
18
  from tqdm import tqdm
@@ -46,6 +45,8 @@ class PdfExtractorNetmindFileAnalysis:
46
45
  self.s3_path = s3_path
47
46
  self.slice_option = slice_option
48
47
  self.max_workers = max_workers
48
+ self.input_file_size = None
49
+ self.total_pages = None
49
50
  self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
50
51
  self.aws_secret_access_key = get_from_dict_or_env(kwargs, "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY")
51
52
  self.s3_client = boto3.client('s3',
@@ -67,8 +68,12 @@ class PdfExtractorNetmindFileAnalysis:
67
68
  def split_pdf(self, input_file: str, output_folder: str) -> List[Dict[str, str]]:
68
69
  Path(output_folder).mkdir(parents=True, exist_ok=True)
69
70
  hash_id = id_srv.get_random_short_id()
71
+ # 获取输入文件的大小
72
+ input_file_path = Path(input_file)
73
+ self.input_file_size = input_file_path.stat().st_size # 获取输入文件的大小(字节)
70
74
  pdf_document = fitz.open(input_file)
71
75
  total_pages = len(pdf_document)
76
+ self.total_pages = total_pages
72
77
  pages_per_split = self.slice_option.split_page_number
73
78
  file_path_list = []
74
79
 
@@ -152,7 +157,6 @@ class PdfExtractorNetmindExtract:
152
157
  },
153
158
  ExpiresIn=604800)
154
159
 
155
- @retry(tries=2, delay=4)
156
160
  def get_netmind_response(self, presigned_url):
157
161
  start = time.time()
158
162
  files = {"url": (None, presigned_url)}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.32
3
+ Version: 0.8.34
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.32
File without changes
File without changes
File without changes
File without changes
File without changes