orbitkit 0.8.31__tar.gz → 0.8.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. {orbitkit-0.8.31/orbitkit.egg-info → orbitkit-0.8.33}/PKG-INFO +14 -2
  2. orbitkit-0.8.33/orbitkit/VERSION +1 -0
  3. orbitkit-0.8.33/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +444 -0
  4. {orbitkit-0.8.31 → orbitkit-0.8.33/orbitkit.egg-info}/PKG-INFO +14 -2
  5. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit.egg-info/SOURCES.txt +1 -0
  6. orbitkit-0.8.31/orbitkit/VERSION +0 -1
  7. {orbitkit-0.8.31 → orbitkit-0.8.33}/LICENSE +0 -0
  8. {orbitkit-0.8.31 → orbitkit-0.8.33}/MANIFEST.in +0 -0
  9. {orbitkit-0.8.31 → orbitkit-0.8.33}/README.md +0 -0
  10. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/__init__.py +0 -0
  11. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/airflow_handler/__init__.py +0 -0
  12. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  13. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  14. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  15. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/airflow_handler/file_handler.py +0 -0
  16. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  17. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/audio_transcoder/__init__.py +0 -0
  18. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  19. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/constant/__init__.py +0 -0
  20. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/constant/report_schema.py +0 -0
  21. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/id_srv/__init__.py +0 -0
  22. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/id_srv/id_gen.py +0 -0
  23. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/id_srv/id_perm_like.py +0 -0
  24. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/lark_send/__init__.py +0 -0
  25. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/lark_send/lark.py +0 -0
  26. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/llm_tools/__init__.py +0 -0
  27. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  28. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/orbit_type/__init__.py +0 -0
  29. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  30. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  31. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/orbit_type/tools.py +0 -0
  32. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_embedding/__init__.py +0 -0
  33. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  34. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  35. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor/__init__.py +0 -0
  36. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  37. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor/exceptions.py +0 -0
  38. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  39. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  40. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  41. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  42. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  43. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  44. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  45. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  46. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  47. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor_simple/base.py +0 -0
  48. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  49. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor_simple/core.py +0 -0
  50. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  51. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  52. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  53. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_writer/__init__.py +0 -0
  54. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  55. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/__init__.py +0 -0
  56. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/common.py +0 -0
  57. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/customize_regix_manager.py +0 -0
  58. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/secret_manager.py +0 -0
  59. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_aliyun.py +0 -0
  60. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  61. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_aws.py +0 -0
  62. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  63. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_date.py +0 -0
  64. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_html.py +0 -0
  65. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_kafka.py +0 -0
  66. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_md5.py +0 -0
  67. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_selenium.py +0 -0
  68. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_simple_timer.py +0 -0
  69. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_str.py +0 -0
  70. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_type_mapping.py +0 -0
  71. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit/util/util_url.py +0 -0
  72. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit.egg-info/dependency_links.txt +0 -0
  73. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit.egg-info/not-zip-safe +0 -0
  74. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit.egg-info/requires.txt +0 -0
  75. {orbitkit-0.8.31 → orbitkit-0.8.33}/orbitkit.egg-info/top_level.txt +0 -0
  76. {orbitkit-0.8.31 → orbitkit-0.8.33}/setup.cfg +0 -0
  77. {orbitkit-0.8.31 → orbitkit-0.8.33}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.31
3
+ Version: 0.8.33
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -28,6 +28,18 @@ Requires-Dist: prettytable>=3.0.0
28
28
  Requires-Dist: pytz>=2022.1
29
29
  Requires-Dist: Deprecated
30
30
  Requires-Dist: func_timeout
31
+ Dynamic: author
32
+ Dynamic: author-email
33
+ Dynamic: classifier
34
+ Dynamic: description
35
+ Dynamic: description-content-type
36
+ Dynamic: home-page
37
+ Dynamic: license
38
+ Dynamic: maintainer
39
+ Dynamic: maintainer-email
40
+ Dynamic: platform
41
+ Dynamic: requires-dist
42
+ Dynamic: summary
31
43
 
32
44
  # orbitkit
33
45
 
@@ -0,0 +1 @@
1
+ 0.8.33
@@ -0,0 +1,444 @@
1
+ from typing import Optional, List, Dict, Any, Coroutine
2
+ from pathlib import Path
3
+ import boto3
4
+ import json
5
+ import tempfile
6
+ import datetime
7
+ import time
8
+ import pytz
9
+ import logging
10
+ import requests
11
+ from orbitkit import id_srv
12
+ from orbitkit.util import s3_split_path, S3Util, get_from_dict_or_env, ExtenCons, s3_path_join, \
13
+ get_content_type_4_filename
14
+ from typing import Optional
15
+ import urllib3
16
+ from retry import retry
17
+ import fitz # PyMuPDF
18
+ import os
19
+ from tqdm import tqdm
20
+ from concurrent.futures import ThreadPoolExecutor, as_completed
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class SplitPageOptions:
    """Options controlling how (and whether) a large PDF is split into chunks."""

    def __init__(self,
                 split_page_number: int = 20,
                 split_size: float = 1 * 1024 * 1024,
                 split_threshold: float = 5 * 1024 * 1024):
        # Number of pages placed in each split chunk.
        self.split_page_number = split_page_number
        # Target byte size of each chunk.
        self.split_size = split_size
        # Files larger than this (bytes) should be split before processing.
        self.split_threshold = split_threshold

    def __repr__(self):
        fields = (f"split_page_number={self.split_page_number}",
                  f"split_size={self.split_size}",
                  f"split_threshold={self.split_threshold}")
        return "SplitPageOptions(" + ", ".join(fields) + ")"

    def needs_split(self, file_size: float) -> bool:
        """Return True when *file_size* (in bytes) exceeds the split threshold."""
        return file_size > self.split_threshold
38
+
39
+
40
class PdfExtractorNetmindFileAnalysis:
    """Split a PDF stored on S3 into page chunks and upload them to a temp bucket.

    The source file is downloaded, cut into ``slice_option.split_page_number``-page
    pieces with PyMuPDF, and the pieces are uploaded concurrently to the temporary
    bucket configured via the BUCKET_TMP / BUCKET_TMP_GROUP environment variables
    (defaults: 'orbit-tmp' / 'fileflow/').
    """

    def __init__(self, s3_path: str,
                 max_workers: int = 4,
                 slice_option: Optional["SplitPageOptions"] = None, **kwargs):
        """
        :param s3_path: Full ``s3://bucket/key`` path of the source PDF.
        :param max_workers: Thread count for the concurrent chunk uploads.
        :param slice_option: Split configuration; a fresh default instance is
            created when omitted.
        :param kwargs: May carry aws_access_key_id / aws_secret_access_key;
            otherwise they are read from the environment.
        """
        self.bucket_tmp = os.getenv('BUCKET_TMP', 'orbit-tmp')
        self.bucket_tmp_group = os.getenv('BUCKET_TMP_GROUP', 'fileflow/')
        self.s3_path = s3_path
        # BUG FIX: the default used to be a single SplitPageOptions() instance
        # created at definition time and shared by every caller (mutable default
        # argument); build one per instance instead.
        self.slice_option = slice_option if slice_option is not None else SplitPageOptions()
        self.max_workers = max_workers
        self.input_file_size = None  # set by split_pdf (source size in bytes)
        self.total_pages = None  # set by split_pdf
        self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
        self.aws_secret_access_key = get_from_dict_or_env(kwargs, "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY")
        self.s3_client = boto3.client('s3',
                                      aws_access_key_id=self.aws_access_key_id,
                                      aws_secret_access_key=self.aws_secret_access_key)
        self.s3_resource = boto3.resource('s3',
                                          aws_access_key_id=self.aws_access_key_id,
                                          aws_secret_access_key=self.aws_secret_access_key)

    def upload_file_to_s3(self, local_key: str, remote_key: str):
        """Upload a local file under the temp bucket's group prefix."""
        _remote_key = f'{self.bucket_tmp_group}{remote_key}'
        self.s3_resource.Object(self.bucket_tmp, _remote_key).upload_file(local_key)
        logger.info(f"File {local_key} uploaded to s3://{self.bucket_tmp}/{_remote_key}")

    def download_file_from_s3(self, bucket: str, remote_key: str, local_key: str):
        """Download s3://bucket/remote_key to the given local path."""
        self.s3_resource.Bucket(bucket).download_file(remote_key, local_key)
        logger.info(f"File s3://{bucket}/{remote_key} downloaded to {local_key}")

    def split_pdf(self, input_file: str, output_folder: str) -> List[Dict[str, str]]:
        """Cut *input_file* into chunks of ``split_page_number`` pages each.

        Side effects: records the source file size and total page count on the
        instance. Chunk files are named ``<random-id>_<1-based-index>.pdf`` —
        downstream merging relies on that index to restore page offsets.

        :return: list of {'local_path': ..., 'remote_name': ...} dicts.
        """
        Path(output_folder).mkdir(parents=True, exist_ok=True)
        hash_id = id_srv.get_random_short_id()
        input_file_path = Path(input_file)
        self.input_file_size = input_file_path.stat().st_size  # bytes
        pdf_document = fitz.open(input_file)
        # BUG FIX: close the document even when saving a chunk raises,
        # instead of leaking the PyMuPDF handle.
        try:
            total_pages = len(pdf_document)
            self.total_pages = total_pages
            pages_per_split = self.slice_option.split_page_number
            file_path_list = []

            for start_page in range(0, total_pages, pages_per_split):
                pdf_writer = fitz.open()
                end_page = min(start_page + pages_per_split - 1, total_pages - 1)
                pdf_writer.insert_pdf(pdf_document, from_page=start_page, to_page=end_page)
                remote_name = f'{hash_id}_{start_page // pages_per_split + 1}.pdf'
                output_file = Path(output_folder) / remote_name
                pdf_writer.save(str(output_file))
                pdf_writer.close()
                file_path_list.append({'local_path': str(output_file), "remote_name": remote_name})
        finally:
            pdf_document.close()
        return file_path_list

    def extract(self) -> list[str]:
        """Download, split and upload the PDF; return the chunk ``s3://`` paths."""
        s3_path_obj = s3_split_path(self.s3_path)
        with tempfile.TemporaryDirectory() as tmp_dir:
            split_folder = Path(tmp_dir) / 'split'
            file_name = s3_path_obj['store_path'].split('/')[-1]
            file_path = Path(tmp_dir) / file_name

            self.download_file_from_s3(s3_path_obj['bucket'], s3_path_obj['store_path'], str(file_path))
            logger.info("Downloaded file successfully...")

            file_path_list = self.split_pdf(str(file_path), str(split_folder))
            # Upload every chunk concurrently; fail fast on the first error.
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                future_to_file = {executor.submit(self.upload_file_to_s3, file['local_path'], file['remote_name']): file
                                  for file in file_path_list}

                for future in as_completed(future_to_file):
                    file = future_to_file[future]
                    try:
                        future.result()
                        logger.warning(f"Uploaded {file['remote_name']} successfully.")
                    except Exception as e:
                        logger.error(f"Error uploading {file['remote_name']}: {e}")
                        raise

            return [f's3://{self.bucket_tmp}/{self.bucket_tmp_group}{i["remote_name"]}' for i in file_path_list]
122
+
123
+
124
class PdfExtractorNetmindExtract:
    """Send a pre-signed S3 PDF URL to the Netmind parse-pdf service and store
    the JSON response next to the source object on S3."""

    def __init__(self, s3_path: str, parse_timeout: int = 10 * 60 + 3, **kwargs):
        """
        :param s3_path: ``s3://bucket/key`` path of the PDF to parse.
        :param parse_timeout: Request timeout (seconds) for the Netmind call.
        :param kwargs: May carry AWS credentials plus netmind_token /
            netmind_service_id; each falls back to its environment variable.
        """
        self.s3_path = s3_path
        self.parse_timeout = parse_timeout
        self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
        self.aws_secret_access_key = get_from_dict_or_env(kwargs, "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY")
        netmind_token = get_from_dict_or_env(
            kwargs, "netmind_token", "NETMIND_TOKEN",
        )
        netmind_service_id = get_from_dict_or_env(
            kwargs, "netmind_service_id", "NETMIND_SERVICE_ID",
        )
        self.netmind_token = netmind_token
        self.netmind_endpoint = f'https://api.netmind.ai/inference-api/v1/inference_service/{netmind_service_id}/api/v1/parse-pdf'
        self.header = {"Authorization": f"Bearer {self.netmind_token}"}
        _creds = {"aws_access_key_id": self.aws_access_key_id,
                  "aws_secret_access_key": self.aws_secret_access_key}
        self.s3_client = boto3.client('s3', **_creds)
        self.s3_resource = boto3.resource('s3', **_creds)

    def netmind_api(self):
        """Run the full flow: presign -> call Netmind -> persist JSON to S3.

        :return: ``s3://`` path of the stored JSON result.
        """
        path_obj = s3_split_path(self.s3_path)
        presigned_url = self._generate_presigned_url(path_obj)
        logger.warning("Get presigned_url successfully...")
        parsed = self.get_netmind_response(presigned_url)
        return self._save_json_to_s3(parsed, path_obj)

    def _generate_presigned_url(self, s3_path_obj):
        # Pre-signed GET link valid for 7 days (604800 s) so Netmind can fetch it.
        params = {
            'Bucket': s3_path_obj["bucket"],
            'Key': s3_path_obj["store_path"],
        }
        return self.s3_client.generate_presigned_url('get_object',
                                                     Params=params,
                                                     ExpiresIn=604800)

    @retry(tries=2, delay=4)
    def get_netmind_response(self, presigned_url):
        """POST the presigned URL to Netmind and return its JSON body."""
        started_at = time.time()
        payload = {"url": (None, presigned_url)}
        response = requests.post(
            self.netmind_endpoint, files=payload, timeout=self.parse_timeout, headers=self.header,
        )
        # Surface HTTP errors so @retry gets a chance to re-attempt.
        response.raise_for_status()
        logger.info(f"Extract text by using Netmind successfully: {time.time() - started_at}")
        return response.json()

    def _save_json_to_s3(self, json_data, s3_path_obj):
        """Serialize *json_data* to a temp file and upload it as '<store_path>.json'."""
        json_key = f"{s3_path_obj['store_path']}.json"
        local_name = json_key.split('/')[-1]
        serialized = json.dumps(json_data)
        with tempfile.TemporaryDirectory() as tmp_dir:
            input_folder = os.path.join(tmp_dir, 'input')
            if not os.path.exists(input_folder):
                os.makedirs(input_folder)
            local_key = os.path.join(input_folder, local_name)
            with open(local_key, 'w') as json_file:
                json_file.write(serialized)
            self.upload_file_to_s3(s3_path_obj['bucket'], local_key, json_key)
        return f"s3://{s3_path_obj['bucket']}/{json_key}"

    def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
        """Upload a local file to the given bucket/key."""
        self.s3_resource.Object(bucket, remote_key).upload_file(local_key)
        logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
192
+
193
+
194
+
195
class PdfExtractorNetmindMerge:
    """Merge per-chunk Netmind JSON results back into one document and publish
    the pages/blocks/raw/metadata artifacts (plus mirrored images) to S3 under
    the ``txt_vector`` prefix next to the source object.
    """

    def __init__(self,
                 source_s3_path: str = None,
                 temp_folder: Optional[str] = None,
                 s3_util: Optional["S3Util"] = None,
                 txt_vector: str = 'txt-vector',
                 slice_option: Optional["SplitPageOptions"] = None,
                 **kwargs):
        """
        :param source_s3_path: ``s3://`` path of the original (unsplit) PDF. Required.
        :param temp_folder: Existing local working folder; a TemporaryDirectory
            is used when omitted.
        :param s3_util: Pre-built S3Util; when omitted one is created from the
            kwargs/environment credentials.
        :param txt_vector: Prefix under which the result artifacts are stored.
        :param slice_option: Must match the options used when the PDF was split;
            chunk_index * split_page_number gives each chunk's page offset.
        """
        self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
        self.aws_secret_access_key = get_from_dict_or_env(kwargs, "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY")
        self.temp_folder = temp_folder
        self.txt_vector = txt_vector
        # BUG FIX: the default used to be a single SplitPageOptions() instance
        # shared across every caller (mutable default argument).
        self.slice_option = slice_option if slice_option is not None else SplitPageOptions()
        self.source_s3_path = source_s3_path
        if not source_s3_path:
            raise Exception('not params source source_s3_path')
        if s3_util:
            self.s3_util = s3_util
        else:
            # Reuse the credentials fetched above instead of looking them up twice.
            self.s3_util = S3Util(aws_access_key_id=self.aws_access_key_id,
                                  aws_secret_access_key=self.aws_secret_access_key)
        self._s3_resource = self.s3_util.get_s3_resource()
        self._s3_client = self.s3_util.get_s3_client()

    def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
        """Upload a local file to the given bucket/key."""
        self._s3_resource.Object(bucket, remote_key).upload_file(local_key)
        logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")

    def download_file_from_s3(self, bucket: str, remote_key: str, local_key: str):
        """Download s3://bucket/remote_key to the given local path."""
        self._s3_resource.Bucket(bucket).download_file(remote_key, local_key)
        logger.info(f"File s3://{bucket}/{remote_key} downloaded to {local_key}")

    def merge_json(self, json_s3_path_list):
        """Download each chunk's JSON result, offset its page numbers by the
        chunk's position, and return all blocks sorted by (page, seq_no).

        :raises ValueError: if the combined result skips any page number.
        :raises Exception: if a path in the list is not a ``.json`` file.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Maps each chunk's starting page offset -> that chunk's block list.
            meta_data = {}
            for _s3_path in json_s3_path_list:
                if '.json' not in _s3_path:
                    raise Exception('s3 file path type error')
                obj = s3_split_path(_s3_path)
                local_path = os.path.join(tmp_dir, obj['store_path'].split('/')[-1])
                self.download_file_from_s3(obj['bucket'], obj['store_path'], local_path)
                with open(local_path, 'r') as file:
                    _split_response_json = json.load(file)
                # Chunk files are named '<hash>_<index>.pdf.json' with a 1-based
                # index, so (index - 1) * split_page_number is the page offset.
                file_item_name = _s3_path.split('/')[-1].replace('.json', '')
                start_page = (int(file_item_name.split('_')[-1].split('.')[0]) - 1) * self.slice_option.split_page_number
                meta_data[start_page] = _split_response_json
            logger.warning("[JSON] Down json result successfully...")
            file_arr = []
            for key, value in meta_data.items():
                logger.info(f"{key} merge {'+' * 3}")
                # Shift each block's page number by the chunk's start offset.
                change_page_data = [{**item, 'page': item['page'] + int(key)} for item in value]
                file_arr.extend(change_page_data)
            sorted_file_arr = sorted(file_arr, key=lambda x: (x['page'], x['seq_no']))
            # Verify the merged page numbers form a contiguous run.
            if sorted_file_arr:
                all_pages = sorted(set(item['page'] for item in sorted_file_arr))
                for i in range(1, len(all_pages)):
                    if all_pages[i] != all_pages[i - 1] + 1:
                        missing_pages = list(range(all_pages[i - 1] + 1, all_pages[i]))
                        raise ValueError(
                            f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,"
                            f"缺少页码: {missing_pages}"
                        )
            logger.info("[JSON] Merge json result successfully...")
            return sorted_file_arr

    def megre_json(self, json_s3_path_list):
        """Deprecated misspelled alias of :meth:`merge_json`, kept so existing
        callers of the original name keep working."""
        return self.merge_json(json_s3_path_list)

    def extract(self, json_s3_path_list):
        """Merge the chunk JSON results and publish all artifacts to S3."""
        response_json = self.merge_json(json_s3_path_list)
        if self.temp_folder:
            if not os.path.exists(self.temp_folder):
                raise Exception('The temp folder given not exists...')
            self.extract_detail(self.temp_folder, response_json)
        else:
            with tempfile.TemporaryDirectory() as tmp_dir:
                input_folder = os.path.join(tmp_dir, 'input')
                os.makedirs(input_folder, exist_ok=True)
                self.extract_detail(input_folder, response_json)

    def extract_detail(self, input_folder, response_json):
        """Generate netmind_reg.txt / blocks.txt / pages.txt from the merged
        blocks, mirror the referenced images into our bucket, and upload all
        artifacts plus a metadata.txt marker."""
        s3_path_obj = s3_split_path(self.source_s3_path)
        # Image mirroring may use unverified TLS streams; silence the warning.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        # >>>>>>>>>>>>>>>>>> Start to generate pages/blocks/raw
        netmind_reg_file = os.path.join(input_folder, 'netmind_reg.txt')
        blocks_file = os.path.join(input_folder, 'blocks.txt')
        pages_file = os.path.join(input_folder, 'pages.txt')

        with open(netmind_reg_file, "w+", encoding='utf-8') as f_reg, \
                open(blocks_file, "w+", encoding='utf-8') as f_blocks, \
                open(pages_file, "w+", encoding='utf-8') as f_pages:

            current_page_num = 0
            current_page_txt = ""
            all_local_image_path = []
            for block in response_json:
                # Re-point image blocks at paths under our own bucket and record
                # the original URL so the bytes can be mirrored afterwards.
                if block["type"] in ["image"]:
                    for item_img in block["image_detail"]:
                        image_extension = str(item_img["path"]).split(".")[-1]
                        image_local_relitive_path = f"{id_srv.get_random_short_id_v2()}.{image_extension}"
                        all_local_image_path.append({
                            "path_raw": item_img["path"],
                            "path_s3": f"images/{image_local_relitive_path}",
                        })
                        item_img["path"] = f"images/{image_local_relitive_path}"

                # For raw
                f_reg.write(json.dumps(block, ensure_ascii=False) + "\n")

                # For blocks
                f_blocks.write(json.dumps(self.convert_2_block(block), ensure_ascii=False) + "\n")

                # For pages: image and sentence-less blocks carry no page text.
                if block["type"] in ["image"]:
                    continue
                if "sentence" not in block:
                    continue
                if block["page"] != current_page_num:
                    if current_page_num != 0:
                        # A new page begins: flush the accumulated previous page.
                        f_pages.write(json.dumps({
                            "id": id_srv.get_random_short_id(),
                            "page": current_page_num,
                            "sentence": current_page_txt,
                        }, ensure_ascii=False) + "\n")

                    # Reset accumulators for the new page.
                    current_page_num = block["page"]
                    current_page_txt = block["sentence"] + "\n\n"
                else:
                    # Still on the same page: append.
                    current_page_txt += block["sentence"] + "\n\n"

            # BUG FIX: flush the trailing page after the loop. The old in-loop
            # "last index" flush was skipped by `continue` whenever the final
            # block was an image or had no sentence, losing the last page.
            if current_page_txt:
                f_pages.write(json.dumps({
                    "id": id_srv.get_random_short_id(),
                    "page": current_page_num,
                    "sentence": current_page_txt,
                }, ensure_ascii=False) + "\n")

        logger.info("Write [blocks.txt] and [pages.txt] successfully...")

        # 上传各种文件到 s3 ---------------------------------------------------------------------------------------
        # Mirror every referenced image by streaming it straight from the source
        # URL into our bucket (no local copy kept).
        for item_img in tqdm(all_local_image_path):
            with requests.get(item_img["path_raw"], stream=True) as response:
                response.raise_for_status()
                self._s3_client.upload_fileobj(
                    response.raw,
                    s3_path_obj['bucket'],
                    s3_path_join(self.txt_vector, s3_path_obj['store_path'], item_img["path_s3"]),
                    ExtraArgs={'ContentType': get_content_type_4_filename(item_img["path_raw"])},
                )
        logger.warning("[image] Store images result successfully...")

        # Upload raw files to s3
        self._s3_client.upload_file(
            netmind_reg_file,
            s3_path_obj['bucket'], s3_path_join(self.txt_vector, s3_path_obj['store_path'], 'netmind_reg.txt'),
            ExtraArgs={'ContentType': ExtenCons.EXTEN_TEXT_TXT_UTF8.value}
        )
        logger.info("[raw] Store raw result successfully...")

        # Upload pages file to s3 and verify it landed.
        pages_txt_key = s3_path_join(self.txt_vector, s3_path_obj['store_path'], 'pages.txt')
        self._s3_client.upload_file(pages_file, s3_path_obj['bucket'], pages_txt_key,
                                    ExtraArgs={'ContentType': ExtenCons.EXTEN_TEXT_TXT_UTF8.value})
        if self.s3_util.check_file_exist(s3_path_obj["bucket"], pages_txt_key) is False:
            raise Exception("[page] Store page result failed...")
        logger.info("[page] Store page result successfully...")

        # Upload blocks file to s3 and verify it landed.
        blocks_txt_key = s3_path_join(self.txt_vector, s3_path_obj['store_path'], 'blocks.txt')
        self._s3_client.upload_file(blocks_file, s3_path_obj['bucket'], blocks_txt_key,
                                    ExtraArgs={'ContentType': ExtenCons.EXTEN_TEXT_TXT_UTF8.value})
        if self.s3_util.check_file_exist(s3_path_obj["bucket"], blocks_txt_key) is False:
            raise Exception("[block] Store block result failed...")
        logger.info("[block] Store block result successfully...")

        extract_meta = {
            "extraction": {
                "version": "netmind",
                "sub_version": "v1",
                "finished_time": datetime.datetime.now(tz=pytz.timezone('UTC')).strftime('%Y-%m-%dT%H:%M:%S%z')
            },
            "metadata": {},
            "others": {}
        }

        object_put = self._s3_resource.Object(s3_path_obj['bucket'],
                                              s3_path_join(self.txt_vector, s3_path_obj['store_path'],
                                                           'metadata.txt'))
        object_put.put(Body=json.dumps(extract_meta, ensure_ascii=False),
                       ContentType=ExtenCons.EXTEN_TEXT_TXT_UTF8.value)
        logger.info("[meta] Store extract meta info successfully...")

    def convert_2_block(self, block_raw):
        """Project a raw Netmind block onto the blocks.txt schema: id/page/seq_no,
        sentence ('' when absent), type, image_detail (only for image blocks)
        and the text location."""
        f_block = {
            "id": block_raw["id"],
            "page": block_raw["page"],
            "seq_no": block_raw["seq_no"],
            "sentence": "",
            "type": block_raw["type"],
            "image_detail": [],
            "text_location": {"location": block_raw["text_location"]["location"]},
        }
        if "sentence" in block_raw:
            f_block["sentence"] = block_raw["sentence"]

        if block_raw["type"] == "image":
            f_block["image_detail"] = block_raw["image_detail"]
        return f_block
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: orbitkit
3
- Version: 0.8.31
3
+ Version: 0.8.33
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -28,6 +28,18 @@ Requires-Dist: prettytable>=3.0.0
28
28
  Requires-Dist: pytz>=2022.1
29
29
  Requires-Dist: Deprecated
30
30
  Requires-Dist: func_timeout
31
+ Dynamic: author
32
+ Dynamic: author-email
33
+ Dynamic: classifier
34
+ Dynamic: description
35
+ Dynamic: description-content-type
36
+ Dynamic: home-page
37
+ Dynamic: license
38
+ Dynamic: maintainer
39
+ Dynamic: maintainer-email
40
+ Dynamic: platform
41
+ Dynamic: requires-dist
42
+ Dynamic: summary
31
43
 
32
44
  # orbitkit
33
45
 
@@ -44,6 +44,7 @@ orbitkit/pdf_extractor/pdf_extractor_azure.py
44
44
  orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py
45
45
  orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py
46
46
  orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py
47
+ orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py
47
48
  orbitkit/pdf_extractor/pdf_extractor_orbit.py
48
49
  orbitkit/pdf_extractor_simple/__init__.py
49
50
  orbitkit/pdf_extractor_simple/base.py
@@ -1 +0,0 @@
1
- 0.8.31
File without changes
File without changes
File without changes
File without changes
File without changes