orbitkit 0.8.31__tar.gz → 0.8.32__tar.gz
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- {orbitkit-0.8.31/orbitkit.egg-info → orbitkit-0.8.32}/PKG-INFO +14 -2
- orbitkit-0.8.32/orbitkit/VERSION +1 -0
- orbitkit-0.8.32/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +438 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32/orbitkit.egg-info}/PKG-INFO +14 -2
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit.egg-info/SOURCES.txt +1 -0
- orbitkit-0.8.31/orbitkit/VERSION +0 -1
- {orbitkit-0.8.31 → orbitkit-0.8.32}/LICENSE +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/MANIFEST.in +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/README.md +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/setup.cfg +0 -0
- {orbitkit-0.8.31 → orbitkit-0.8.32}/setup.py +0 -0

{orbitkit-0.8.31/orbitkit.egg-info → orbitkit-0.8.32}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: orbitkit
-Version: 0.8.31
+Version: 0.8.32
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao
@@ -28,6 +28,18 @@ Requires-Dist: prettytable>=3.0.0
 Requires-Dist: pytz>=2022.1
 Requires-Dist: Deprecated
 Requires-Dist: func_timeout
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: maintainer
+Dynamic: maintainer-email
+Dynamic: platform
+Dynamic: requires-dist
+Dynamic: summary
 
 # orbitkit
 
orbitkit-0.8.32/orbitkit/VERSION ADDED

@@ -0,0 +1 @@
+0.8.32
orbitkit-0.8.32/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py ADDED

@@ -0,0 +1,438 @@
+from typing import Optional, List, Dict, Any, Coroutine
+from pathlib import Path
+import boto3
+import json
+import tempfile
+import datetime
+import time
+import pytz
+import logging
+import requests
+from orbitkit import id_srv
+from orbitkit.util import s3_split_path, S3Util, get_from_dict_or_env, ExtenCons, s3_path_join, \
+    get_content_type_4_filename
+from typing import Optional
+import urllib3
+from retry import retry
+import fitz  # PyMuPDF
+import os
+from tqdm import tqdm
+from concurrent.futures import ThreadPoolExecutor, as_completed
+logger = logging.getLogger(__name__)
+
+
+class SplitPageOptions:
+    def __init__(self, split_page_number: int = 20, split_size: float = 1 * 1024 * 1024,
+                 split_threshold: float = 5 * 1024 * 1024):
+        self.split_page_number = split_page_number
+        self.split_size = split_size
+        self.split_threshold = split_threshold
+
+    def __repr__(self):
+        return (f"SplitPageOptions(split_page_number={self.split_page_number}, "
+                f"split_size={self.split_size}, "
+                f"split_threshold={self.split_threshold})")
+
+    def needs_split(self, file_size: float) -> bool:
+        return file_size > self.split_threshold
+
+
+class PdfExtractorNetmindFileAnalysis:
+    def __init__(self, s3_path: str,
+                 max_workers: int = 4,
+                 slice_option: Optional[SplitPageOptions] = SplitPageOptions(), **kwargs):
+        self.bucket_tmp = os.getenv('BUCKET_TMP', 'orbit-tmp')
+        self.bucket_tmp_group = os.getenv('BUCKET_TMP_GROUP', 'fileflow/')
+        self.s3_path = s3_path
+        self.slice_option = slice_option
+        self.max_workers = max_workers
+        self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
+        self.aws_secret_access_key = get_from_dict_or_env(kwargs, "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY")
+        self.s3_client = boto3.client('s3',
+                                      aws_access_key_id=self.aws_access_key_id,
+                                      aws_secret_access_key=self.aws_secret_access_key)
+        self.s3_resource = boto3.resource('s3',
+                                          aws_access_key_id=self.aws_access_key_id,
+                                          aws_secret_access_key=self.aws_secret_access_key)
+
+    def upload_file_to_s3(self, local_key: str, remote_key: str):
+        _remote_key = f'{self.bucket_tmp_group}{remote_key}'
+        self.s3_resource.Object(self.bucket_tmp, _remote_key).upload_file(local_key)
+        logger.info(f"File {local_key} uploaded to s3://{self.bucket_tmp}/{_remote_key}")
+
+    def download_file_from_s3(self, bucket: str, remote_key: str, local_key: str):
+        self.s3_resource.Bucket(bucket).download_file(remote_key, local_key)
+        logger.info(f"File s3://{bucket}/{remote_key} downloaded to {local_key}")
+
+    def split_pdf(self, input_file: str, output_folder: str) -> List[Dict[str, str]]:
+        Path(output_folder).mkdir(parents=True, exist_ok=True)
+        hash_id = id_srv.get_random_short_id()
+        pdf_document = fitz.open(input_file)
+        total_pages = len(pdf_document)
+        pages_per_split = self.slice_option.split_page_number
+        file_path_list = []
+
+        for start_page in range(0, total_pages, pages_per_split):
+            pdf_writer = fitz.open()
+            end_page = min(start_page + pages_per_split - 1, total_pages - 1)
+            pdf_writer.insert_pdf(pdf_document, from_page=start_page, to_page=end_page)
+            remote_name = f'{hash_id}_{start_page // pages_per_split + 1}.pdf'
+            output_file = Path(output_folder) / remote_name
+            pdf_writer.save(str(output_file))
+            pdf_writer.close()
+            file_path_list.append({'local_path': str(output_file), "remote_name": remote_name})
+
+        pdf_document.close()
+        return file_path_list
+
+    def extract(self) -> list[str]:
+        s3_path_obj = s3_split_path(self.s3_path)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            split_folder = Path(tmp_dir) / 'split'
+            file_name = s3_path_obj['store_path'].split('/')[-1]
+            file_path = Path(tmp_dir) / file_name
+
+            self.download_file_from_s3(s3_path_obj['bucket'], s3_path_obj['store_path'], str(file_path))
+            logger.info("Downloaded file successfully...")
+
+            file_path_list = self.split_pdf(str(file_path), str(split_folder))
+            # for file in file_path_list:
+            #     self.upload_file_to_s3(file['local_path'], file['remote_name'])
+            # Upload the slices concurrently with a ThreadPoolExecutor
+            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                future_to_file = {executor.submit(self.upload_file_to_s3, file['local_path'], file['remote_name']): file
+                                  for file in file_path_list}
+
+                for future in as_completed(future_to_file):
+                    file = future_to_file[future]
+                    try:
+                        future.result()  # Collect the upload result
+                        logger.warning(f"Uploaded {file['remote_name']} successfully.")
+                    except Exception as e:
+                        logger.error(f"Error uploading {file['remote_name']}: {e}")
+                        raise
+
+        return [f's3://{self.bucket_tmp}/{self.bucket_tmp_group}{i["remote_name"]}' for i in file_path_list]
+
+
+class PdfExtractorNetmindExtract:
+    def __init__(self, s3_path: str, parse_timeout: int = 10 * 60 + 3, **kwargs):
+        self.s3_path = s3_path
+        self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
+        self.aws_secret_access_key = get_from_dict_or_env(kwargs, "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY")
+        self.parse_timeout = parse_timeout
+        netmind_token = get_from_dict_or_env(
+            kwargs, "netmind_token", "NETMIND_TOKEN",
+        )
+        netmind_service_id = get_from_dict_or_env(
+            kwargs, "netmind_service_id", "NETMIND_SERVICE_ID",
+        )
+        self.netmind_token = netmind_token
+        self.netmind_endpoint = f'https://api.netmind.ai/inference-api/v1/inference_service/{netmind_service_id}/api/v1/parse-pdf'
+        self.header = {"Authorization": f"Bearer {self.netmind_token}"}
+        self.s3_client = boto3.client('s3',
+                                      aws_access_key_id=self.aws_access_key_id,
+                                      aws_secret_access_key=self.aws_secret_access_key)
+        self.s3_resource = boto3.resource('s3',
+                                          aws_access_key_id=self.aws_access_key_id,
+                                          aws_secret_access_key=self.aws_secret_access_key)
+
+    def netmind_api(self):
+        s3_path_obj = s3_split_path(self.s3_path)
+        presigned_url = self._generate_presigned_url(s3_path_obj)
+        logger.warning("Get presigned_url successfully...")
+        json_response = self.get_netmind_response(presigned_url)
+        return self._save_json_to_s3(json_response, s3_path_obj)
+
+    def _generate_presigned_url(self, s3_path_obj):
+        return self.s3_client.generate_presigned_url('get_object',
+                                                     Params={
+                                                         'Bucket': s3_path_obj["bucket"],
+                                                         'Key': s3_path_obj["store_path"]
+                                                     },
+                                                     ExpiresIn=604800)
+
+    @retry(tries=2, delay=4)
+    def get_netmind_response(self, presigned_url):
+        start = time.time()
+        files = {"url": (None, presigned_url)}
+        response = requests.post(
+            self.netmind_endpoint, files=files, timeout=self.parse_timeout, headers=self.header,
+        )
+        # Status check
+        response.raise_for_status()
+        logger.info(f"Extract text by using Netmind successfully: {time.time() - start}")
+        return response.json()
+
+    def _save_json_to_s3(self, json_data, s3_path_obj):
+        json_key = f"{s3_path_obj['store_path']}.json"  # Build the JSON file name
+        local_name = json_key.split('/')[-1]
+        json_content = json.dumps(json_data)  # Convert to a JSON string
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            input_folder = os.path.join(tmp_dir, 'input')
+            if not os.path.exists(input_folder):
+                os.makedirs(input_folder)
+            local_key = os.path.join(input_folder, local_name)  # Temporary file path
+            # Write the JSON data to the temporary file
+            with open(local_key, 'w') as json_file:
+                json_file.write(json_content)
+            # Upload to S3
+            self.upload_file_to_s3(s3_path_obj['bucket'], local_key, json_key)
+        return f"s3://{s3_path_obj['bucket']}/{json_key}"
+
+    def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
+        self.s3_resource.Object(bucket, remote_key).upload_file(local_key)
+        logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
+
+
+
+class PdfExtractorNetmindMerge:
+    def __init__(self,
+                 source_s3_path: str = None,
+                 temp_folder: Optional[str] = None,
+                 s3_util: Optional[S3Util] = None,
+                 txt_vector: str = 'txt-vector',
+                 slice_option: Optional[SplitPageOptions] = SplitPageOptions(),
+                 **kwargs):
+        self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
+        self.aws_secret_access_key = get_from_dict_or_env(kwargs, "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY")
+        self.temp_folder = temp_folder
+        self.txt_vector = txt_vector
+        self.slice_option = slice_option
+        self.source_s3_path = source_s3_path
+        if not source_s3_path:
+            raise Exception('not params source source_s3_path')
+        if s3_util:
+            self.s3_util = s3_util
+        else:
+            # Try to get key aws pair
+            aws_access_key_id = get_from_dict_or_env(
+                kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID",
+            )
+
+            aws_secret_access_key = get_from_dict_or_env(
+                kwargs, "aws_secret_access_key", "AWS_SECRET_ACCESS_KEY",
+            )
+            self.s3_util = S3Util(aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
+        self._s3_resource = self.s3_util.get_s3_resource()
+        self._s3_client = self.s3_util.get_s3_client()
+
+
+    def upload_file_to_s3(self, bucket, local_key: str, remote_key: str):
+        self._s3_resource.Object(bucket, remote_key).upload_file(local_key)
+        logger.warning(f"File {local_key} Uploaded To s3://{bucket}/{remote_key}")
+
+
+    def download_file_from_s3(self, bucket: str, remote_key: str, local_key: str):
+        self._s3_resource.Bucket(bucket).download_file(remote_key, local_key)
+        logger.info(f"File s3://{bucket}/{remote_key} downloaded to {local_key}")
+
+
+    def megre_json(self, json_s3_path_list):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # Slice JSON keyed by start-page offset
+            meta_data = {}
+            for _s3_path in json_s3_path_list:
+                if '.json' not in _s3_path:
+                    raise Exception('s3 file path type error')
+                obj = s3_split_path(_s3_path)
+                local_path = os.path.join(tmp_dir, obj['store_path'].split('/')[-1])
+                self.download_file_from_s3(obj['bucket'], obj['store_path'], local_path)
+                # Download the s3 file and read the JSON
+                with open(local_path, 'r') as file:
+                    _split_response_json = json.load(file)
+                file_item_name = _s3_path.split('/')[-1].replace('.json', '')
+                start_page = (int(file_item_name.split('_')[-1].split('.')[0]) - 1) * self.slice_option.split_page_number
+                meta_data[start_page] = _split_response_json
+            logger.warning("[JSON] Down json result successfully...")
+            # Merge
+            file_arr = []
+            for key, value in meta_data.items():
+                logger.info(f"{key} merge {'+' * 3}")
+                # Shift page numbers by the slice offset
+                change_page_data = [{**item, 'page': item['page'] + int(key)} for item in value]
+                file_arr.extend(change_page_data)
+            # Sort by page and seq_no
+            sorted_file_arr = sorted(file_arr, key=lambda x: (x['page'], x['seq_no']))
+            # Check page-number continuity
+            if sorted_file_arr:
+                # Collect all page numbers, dedupe, and sort
+                all_pages = sorted(set(item['page'] for item in sorted_file_arr))
+
+                # Verify the page numbers are consecutive
+                for i in range(1, len(all_pages)):
+                    if all_pages[i] != all_pages[i - 1] + 1:
+                        missing_pages = list(range(all_pages[i - 1] + 1, all_pages[i]))
+                        raise ValueError(
+                            f"Non-consecutive page numbers: page {all_pages[i]} directly follows page {all_pages[i - 1]}, "
+                            f"missing pages: {missing_pages}"
+                        )
+            logger.info("[JSON] Merge json result successfully...")
+            return sorted_file_arr
+
+    def extract(self, json_s3_path_list):
+        response_json = self.megre_json(json_s3_path_list)
+        if self.temp_folder:
+            if not os.path.exists(self.temp_folder):
+                raise Exception('The temp folder given not exists...')
+            self.extract_detail(self.temp_folder, response_json)
+        else:
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                input_folder = os.path.join(tmp_dir, 'input')
+                if not os.path.exists(input_folder):
+                    os.makedirs(input_folder)
+
+                self.extract_detail(input_folder, response_json)
+
+    def extract_detail(self, input_folder, response_json):
+        s3_path_obj = s3_split_path(self.source_s3_path)
+        # Disable InsecureRequestWarning
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+        # >>>>>>>>>>>>>>>>>> Start to generate pages/blocks/raw
+        netmind_reg_file = os.path.join(input_folder, f'netmind_reg.txt')
+        blocks_file = os.path.join(input_folder, f'blocks.txt')
+        pages_file = os.path.join(input_folder, f'pages.txt')
+
+        with open(netmind_reg_file, "w+", encoding='utf-8') as f_reg, \
+                open(blocks_file, "w+", encoding='utf-8') as f_blocks, \
+                open(pages_file, "w+", encoding='utf-8') as f_pages:
+
+            current_page_num = 0
+            current_page_txt = ""
+            all_local_image_path = []
+            for ind, block in enumerate(response_json, start=1):
+                # If the block is an image, convert its image paths
+                if block["type"] in ["image"]:
+                    for item_img in block["image_detail"]:
+                        # res_img = requests.get(item_img["path"], verify=False)
+                        # if res_img.status_code != 200:
+                        #     raise Exception(f"Download image failed: {res_img.status_code}")
+
+                        # Build image local path for using
+                        image_extension = str(item_img["path"]).split(".")[-1]
+                        image_local_relitive_path = f"{id_srv.get_random_short_id_v2()}.{image_extension}"
+                        # image_local_abs_path = os.path.join(input_folder, image_local_relitive_path)
+                        # with open(image_local_abs_path, mode="wb") as img_tmp:
+                        #     img_tmp.write(res_img.content)
+                        # all_local_image_path.append(image_local_relitive_path)
+                        all_local_image_path.append({
+                            "path_raw": item_img["path"],
+                            "path_s3": f"images/{image_local_relitive_path}",
+                        })
+                        # Rewrite its image path to our own relative path
+                        item_img["path"] = f"images/{image_local_relitive_path}"
+
+                # For raw
+                f_reg.write(json.dumps(block, ensure_ascii=False) + "\n")
+
+                # For blocks
+                f_blocks.write(json.dumps(self.convert_2_block(block), ensure_ascii=False) + "\n")
+
+                # For page: image Not in combine list
+                if block["type"] in ["image"]:
+                    continue
+                if "sentence" not in block:
+                    continue
+                if block["page"] != current_page_num:
+                    if current_page_num != 0:
+                        # Start a new page and write the previous page to the file
+                        f_pages.write(json.dumps({
+                            "id": id_srv.get_random_short_id(),
+                            "page": current_page_num,
+                            "sentence": current_page_txt,
+                        }, ensure_ascii=False) + "\n")
+
+                    # Reset to prepare for the next page
+                    current_page_num = block["page"]
+                    current_page_txt = block["sentence"] + "\n\n"
+                else:
+                    # Still on the same page
+                    current_page_txt += block["sentence"] + "\n\n"
+
+                # If this is the last block, flush the final page
+                if ind == len(response_json):
+                    f_pages.write(json.dumps({
+                        "id": id_srv.get_random_short_id(),
+                        "page": current_page_num,
+                        "sentence": current_page_txt,
+                    }, ensure_ascii=False) + "\n")
+
+        logger.info(f"Write [blocks.txt] and [pages.txt] successfully...")
+
+        # Upload the generated files to s3 ---------------------------------------------------------------------
+        # Update images to s3
+        for item_img in tqdm(all_local_image_path):
+            with requests.get(item_img["path_raw"], stream=True) as response:
+                response.raise_for_status()  # Check for success
+                self._s3_client.upload_fileobj(
+                    response.raw,
+                    s3_path_obj['bucket'],
+                    s3_path_join(self.txt_vector, s3_path_obj['store_path'], item_img["path_s3"]),
+                    ExtraArgs={'ContentType': get_content_type_4_filename(item_img["path_raw"])},
+                )
+
+        # for item_img in all_local_image_path:
+        #     self._s3_client.upload_file(
+        #         os.path.join(input_folder, item_img),
+        #         s3_path_obj['bucket'], s3_path_join(self.txt_vector, s3_path_obj['store_path'], 'images', item_img),
+        #         ExtraArgs={'ContentType': get_content_type_4_filename(item_img)}
+        #     )
+        logger.warning("[image] Store images result successfully...")
+
+        # Upload raw files to s3
+        self._s3_client.upload_file(
+            netmind_reg_file,
+            s3_path_obj['bucket'], s3_path_join(self.txt_vector, s3_path_obj['store_path'], f'netmind_reg.txt'),
+            ExtraArgs={'ContentType': ExtenCons.EXTEN_TEXT_TXT_UTF8.value}
+        )
+        logger.info("[raw] Store raw result successfully...")
+
+        # Upload pages files to s3
+        pages_txt_key = s3_path_join(self.txt_vector, s3_path_obj['store_path'], f'pages.txt')
+        self._s3_client.upload_file(os.path.join(input_folder, 'pages.txt'), s3_path_obj['bucket'], pages_txt_key,
+                                    ExtraArgs={'ContentType': ExtenCons.EXTEN_TEXT_TXT_UTF8.value})
+        if self.s3_util.check_file_exist(s3_path_obj["bucket"], pages_txt_key) is False:
+            raise Exception("[page] Store page result failed...")
+        logger.info("[page] Store page result successfully...")
+
+        # Upload blocks files to s3
+        blocks_txt_key = s3_path_join(self.txt_vector, s3_path_obj['store_path'], f'blocks.txt')
+        self._s3_client.upload_file(os.path.join(input_folder, f'blocks.txt'), s3_path_obj['bucket'], blocks_txt_key,
+                                    ExtraArgs={'ContentType': ExtenCons.EXTEN_TEXT_TXT_UTF8.value})
+        if self.s3_util.check_file_exist(s3_path_obj["bucket"], blocks_txt_key) is False:
+            raise Exception("[block] Store block result failed...")
+        logger.info("[block] Store block result successfully...")
+
+        extract_meta = {
+            "extraction": {
+                "version": "netmind",
+                "sub_version": "v1",
+                "finished_time": datetime.datetime.now(tz=pytz.timezone('UTC')).strftime('%Y-%m-%dT%H:%M:%S%z')
+            },
+            "metadata": {},
+            "others": {}
+        }
+
+        object_put = self._s3_resource.Object(s3_path_obj['bucket'],
+                                              s3_path_join(self.txt_vector, s3_path_obj['store_path'],
+                                                           f'metadata.txt'))
+        object_put.put(Body=json.dumps(extract_meta, ensure_ascii=False),
+                       ContentType=ExtenCons.EXTEN_TEXT_TXT_UTF8.value)
+        logger.info("[meta] Store extract meta info successfully...")
+
+    def convert_2_block(self, block_raw):
+        f_block = {
+            "id": block_raw["id"],
+            "page": block_raw["page"],
+            "seq_no": block_raw["seq_no"],
+            "sentence": "",
+            "type": block_raw["type"],
+            "image_detail": [],
+            "text_location": {"location": block_raw["text_location"]["location"]},
+        }
+        if "sentence" in block_raw:
+            f_block["sentence"] = block_raw["sentence"]

        if block_raw["type"] == "image":
            f_block["image_detail"] = block_raw["image_detail"]
        return f_block
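
For orientation, below is a minimal usage sketch of the three-stage pipeline the new module implements (split, parse, merge). This sketch is not part of the package or of this diff; the bucket and key in source are hypothetical, and it assumes AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, NETMIND_TOKEN, and NETMIND_SERVICE_ID are set in the environment.

# Hedged sketch (not from the package docs): wiring the three new classes together.
from orbitkit.pdf_extractor.pdf_extractor_netmind_v3 import (
    PdfExtractorNetmindFileAnalysis,
    PdfExtractorNetmindExtract,
    PdfExtractorNetmindMerge,
    SplitPageOptions,
)

source = "s3://my-bucket/reports/annual_report.pdf"  # hypothetical input path

# Stage 1: download the PDF, split it into 20-page slices, and upload the
# slices to the tmp bucket; returns the s3 paths of the uploaded slices.
slices = PdfExtractorNetmindFileAnalysis(source, slice_option=SplitPageOptions()).extract()

# Stage 2: parse each slice through the Netmind parse-pdf endpoint; each call
# stores a <slice>.json next to the slice and returns that JSON's s3 path.
json_paths = [PdfExtractorNetmindExtract(p).netmind_api() for p in slices]

# Stage 3: merge the per-slice JSON (renumbering pages by slice offset) and
# write netmind_reg.txt / blocks.txt / pages.txt / metadata.txt under the
# txt-vector prefix of the source bucket.
PdfExtractorNetmindMerge(source_s3_path=source).extract(json_paths)
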
{orbitkit-0.8.31 → orbitkit-0.8.32/orbitkit.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: orbitkit
-Version: 0.8.31
+Version: 0.8.32
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao
@@ -28,6 +28,18 @@ Requires-Dist: prettytable>=3.0.0
 Requires-Dist: pytz>=2022.1
 Requires-Dist: Deprecated
 Requires-Dist: func_timeout
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: maintainer
+Dynamic: maintainer-email
+Dynamic: platform
+Dynamic: requires-dist
+Dynamic: summary
 
 # orbitkit
 
{orbitkit-0.8.31 → orbitkit-0.8.32}/orbitkit.egg-info/SOURCES.txt

@@ -44,6 +44,7 @@ orbitkit/pdf_extractor/pdf_extractor_azure.py
 orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py
 orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py
 orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py
+orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py
 orbitkit/pdf_extractor/pdf_extractor_orbit.py
 orbitkit/pdf_extractor_simple/__init__.py
 orbitkit/pdf_extractor_simple/base.py
orbitkit-0.8.31/orbitkit/VERSION DELETED

@@ -1 +0,0 @@
-0.8.31