orbitkit 0.8.35__tar.gz → 0.8.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.35/orbitkit.egg-info → orbitkit-0.8.37}/PKG-INFO +2 -14
- orbitkit-0.8.37/orbitkit/VERSION +1 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +9 -4
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/__init__.py +3 -0
- orbitkit-0.8.37/orbitkit/util/cache_asset_downloader.py +132 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37/orbitkit.egg-info}/PKG-INFO +2 -14
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit.egg-info/SOURCES.txt +1 -0
- orbitkit-0.8.35/orbitkit/VERSION +0 -1
- {orbitkit-0.8.35 → orbitkit-0.8.37}/LICENSE +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/MANIFEST.in +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/README.md +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/setup.cfg +0 -0
- {orbitkit-0.8.35 → orbitkit-0.8.37}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: orbitkit
|
|
3
|
-
Version: 0.8.35
|
|
3
|
+
Version: 0.8.37
|
|
4
4
|
Summary: This project is only for Orbit Tech internal use.
|
|
5
5
|
Home-page: https://github.com/clown-0726/orbitkit
|
|
6
6
|
Author: Lilu Cao
|
|
@@ -28,18 +28,6 @@ Requires-Dist: prettytable>=3.0.0
|
|
|
28
28
|
Requires-Dist: pytz>=2022.1
|
|
29
29
|
Requires-Dist: Deprecated
|
|
30
30
|
Requires-Dist: func_timeout
|
|
31
|
-
Dynamic: author
|
|
32
|
-
Dynamic: author-email
|
|
33
|
-
Dynamic: classifier
|
|
34
|
-
Dynamic: description
|
|
35
|
-
Dynamic: description-content-type
|
|
36
|
-
Dynamic: home-page
|
|
37
|
-
Dynamic: license
|
|
38
|
-
Dynamic: maintainer
|
|
39
|
-
Dynamic: maintainer-email
|
|
40
|
-
Dynamic: platform
|
|
41
|
-
Dynamic: requires-dist
|
|
42
|
-
Dynamic: summary
|
|
43
31
|
|
|
44
32
|
# orbitkit
|
|
45
33
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.37
|
|
@@ -197,6 +197,7 @@ class PdfExtractorNetmindMerge:
|
|
|
197
197
|
temp_folder: Optional[str] = None,
|
|
198
198
|
s3_util: Optional[S3Util] = None,
|
|
199
199
|
txt_vector: str = 'txt-vector',
|
|
200
|
+
is_page_number_discontinuity_exception_thrown: bool = False, #页码不连续异常抛出
|
|
200
201
|
slice_option: Optional[SplitPageOptions] = SplitPageOptions(),
|
|
201
202
|
**kwargs):
|
|
202
203
|
self.aws_access_key_id = get_from_dict_or_env(kwargs, "aws_access_key_id", "AWS_ACCESS_KEY_ID")
|
|
@@ -205,6 +206,7 @@ class PdfExtractorNetmindMerge:
|
|
|
205
206
|
self.txt_vector = txt_vector
|
|
206
207
|
self.slice_option = slice_option
|
|
207
208
|
self.source_s3_path = source_s3_path
|
|
209
|
+
self.is_page_number_discontinuity_exception_thrown = is_page_number_discontinuity_exception_thrown
|
|
208
210
|
if not source_s3_path:
|
|
209
211
|
raise Exception('not params source source_s3_path')
|
|
210
212
|
if s3_util:
|
|
@@ -266,10 +268,13 @@ class PdfExtractorNetmindMerge:
|
|
|
266
268
|
for i in range(1, len(all_pages)):
|
|
267
269
|
if all_pages[i] != all_pages[i - 1] + 1:
|
|
268
270
|
missing_pages = list(range(all_pages[i - 1] + 1, all_pages[i]))
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
271
|
+
if self.is_page_number_discontinuity_exception_thrown:
|
|
272
|
+
raise ValueError(
|
|
273
|
+
f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,"
|
|
274
|
+
f"缺少页码: {missing_pages}"
|
|
275
|
+
)
|
|
276
|
+
else:
|
|
277
|
+
print(f"页码不连续错误!在 {all_pages[i - 1]} 页之后直接出现了 {all_pages[i]} 页,缺少页码: {missing_pages}")
|
|
273
278
|
logger.info("[JSON] Merge json result successfully...")
|
|
274
279
|
return sorted_file_arr
|
|
275
280
|
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import platform
|
|
3
|
+
import hashlib
|
|
4
|
+
import requests
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
"""
|
|
10
|
+
功能描述
|
|
11
|
+
CacheAssetDownloader 是一个智能的资源下载器,主要用于管理大型文件(如机器学习模型、数据集等)的下载和本地缓存。
|
|
12
|
+
|
|
13
|
+
跨平台缓存管理:
|
|
14
|
+
Windows:使用 %LOCALAPPDATA% 或 用户目录/AppData/Local
|
|
15
|
+
macOS:使用 ~/Library/Caches
|
|
16
|
+
Linux/Docker:使用 $XDG_CACHE_HOME 或 ~/.cache
|
|
17
|
+
|
|
18
|
+
智能缓存机制:
|
|
19
|
+
自动检查本地缓存是否存在
|
|
20
|
+
支持 MD5 哈希验证确保文件完整性
|
|
21
|
+
避免重复下载,节省带宽和时间
|
|
22
|
+
|
|
23
|
+
安全下载:
|
|
24
|
+
流式下载,适合大文件
|
|
25
|
+
异常处理和错误恢复
|
|
26
|
+
文件验证失败时自动清理
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class CacheAssetDownloader:
    """Download a remote asset once and reuse it from a per-user cache folder.

    The asset is stored under ``<cache base>/orbit_assets/<asset_sub_folder>/``
    using the last path component of ``download_url`` as the file name. The
    download (or cache lookup) happens eagerly in ``__init__``.

    Args:
        asset_sub_folder: Folder name below ``orbit_assets`` for this asset.
        download_url: URL of the asset; its path must end in a file name.
        expected_hash: Optional MD5 hex digest. When given, both cached and
            freshly downloaded files must match it (case-insensitive).
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            server cannot block forever.

    Raises:
        ValueError: If no file name can be derived from ``download_url``.
        RuntimeError: If the download fails or the downloaded file does not
            match ``expected_hash``.
    """

    def __init__(self, asset_sub_folder: str, download_url: str,
                 expected_hash: Optional[str] = None, timeout: float = 60.0):
        self.asset_sub_folder = asset_sub_folder
        self.download_url = download_url
        self.expected_hash = expected_hash
        self.timeout = timeout

        # Resolve (and create) the cache directory.
        self.cache_dir = self._get_cache_dir()
        self.asset_path: Optional[Path] = None

        # Fetch the asset right away unless a valid cached copy exists.
        self._download_if_needed()

    def _get_cache_dir(self) -> Path:
        """Return the cache directory for this asset, creating it if needed."""
        system = platform.system().lower()

        # `or` (rather than a .get() default) also covers variables that are
        # set but empty, which would otherwise resolve to the working directory.
        if system == 'windows':
            base_dir = Path(os.environ.get('LOCALAPPDATA') or Path.home() / 'AppData' / 'Local')
        elif system == 'darwin':  # macOS
            base_dir = Path.home() / 'Library' / 'Caches'
        else:  # Linux/Docker
            base_dir = Path(os.environ.get('XDG_CACHE_HOME') or Path.home() / '.cache')

        cache_dir = base_dir / 'orbit_assets' / self.asset_sub_folder
        cache_dir.mkdir(parents=True, exist_ok=True)

        print(f"You current system: >>>{system}<<<, with cache path: >>>{cache_dir}<<<")
        return cache_dir

    def _validate_file(self, file_path: Path) -> bool:
        """Return True if ``file_path`` is a regular file that passes the hash check.

        Without ``expected_hash`` the mere presence of the file is accepted.
        """
        # is_file() rather than exists(): a directory is never a valid asset.
        if not file_path.is_file():
            return False

        if not self.expected_hash:  # hash verification disabled
            return True

        # Hash in chunks so large assets are not loaded into memory at once.
        hash_md5 = hashlib.md5()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)

        return hash_md5.hexdigest().lower() == self.expected_hash.lower()

    def _download_file(self, url: str, target_path: Path) -> bool:
        """Stream ``url`` to ``target_path``; return True on success, False on failure.

        Data is written to a sibling ``.part`` file and only moved onto
        ``target_path`` after the transfer completed, so an interrupted
        download never leaves a truncated file that a later run would accept
        as a cache hit.
        """
        tmp_path = target_path.with_name(target_path.name + ".part")
        try:
            print(f"正在下载: {url}")
            # The context manager releases the connection even on errors.
            with requests.get(url, stream=True, timeout=self.timeout) as response:
                response.raise_for_status()

                with open(tmp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            os.replace(tmp_path, target_path)
            print(f"下载完成: {target_path}")
            return True
        except Exception as e:
            # Broad on purpose: callers rely on the False return value and
            # turn it into a RuntimeError.
            print(f"下载失败: {e}")
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup; the failure itself is already reported.
                pass
            return False

    def _download_if_needed(self):
        """Use the cached asset if it is valid, otherwise download and verify it."""
        # Derive the file name from the URL path (query string is ignored).
        filename = os.path.basename(urlparse(self.download_url).path)
        if not filename:
            # An empty name would make the cache directory itself the "asset".
            raise ValueError(
                f"Cannot derive a file name from download URL: {self.download_url}"
            )
        asset_file = self.cache_dir / filename

        # Cache hit.
        if self._validate_file(asset_file):
            print(f"使用缓存文件: {asset_file}")
            self.asset_path = asset_file
            return

        # Cache miss (or stale/corrupt file): download.
        if not self._download_file(self.download_url, asset_file):
            raise RuntimeError(f"下载失败: {self.download_url}")

        # Verify the fresh download and discard it if it does not match.
        if not self._validate_file(asset_file):
            asset_file.unlink(missing_ok=True)
            raise RuntimeError("文件验证失败")

        self.asset_path = asset_file

    def get_path(self) -> Path:
        """Return the local path of the cached asset."""
        return self.asset_path
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
if __name__ == "__main__":
    # Manual smoke test: fetch (or reuse from cache) the fastText language
    # identification model and show where it is stored locally.
    lid_model_url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
    downloader = CacheAssetDownloader(
        download_url=lid_model_url,
        asset_sub_folder="fasttext",
    )

    model_path = downloader.get_path()
    print(f"模型路径: {model_path}")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: orbitkit
|
|
3
|
-
Version: 0.8.35
|
|
3
|
+
Version: 0.8.37
|
|
4
4
|
Summary: This project is only for Orbit Tech internal use.
|
|
5
5
|
Home-page: https://github.com/clown-0726/orbitkit
|
|
6
6
|
Author: Lilu Cao
|
|
@@ -28,18 +28,6 @@ Requires-Dist: prettytable>=3.0.0
|
|
|
28
28
|
Requires-Dist: pytz>=2022.1
|
|
29
29
|
Requires-Dist: Deprecated
|
|
30
30
|
Requires-Dist: func_timeout
|
|
31
|
-
Dynamic: author
|
|
32
|
-
Dynamic: author-email
|
|
33
|
-
Dynamic: classifier
|
|
34
|
-
Dynamic: description
|
|
35
|
-
Dynamic: description-content-type
|
|
36
|
-
Dynamic: home-page
|
|
37
|
-
Dynamic: license
|
|
38
|
-
Dynamic: maintainer
|
|
39
|
-
Dynamic: maintainer-email
|
|
40
|
-
Dynamic: platform
|
|
41
|
-
Dynamic: requires-dist
|
|
42
|
-
Dynamic: summary
|
|
43
31
|
|
|
44
32
|
# orbitkit
|
|
45
33
|
|
|
@@ -56,6 +56,7 @@ orbitkit/pdf_extractor_simple/utils.py
|
|
|
56
56
|
orbitkit/pdf_writer/__init__.py
|
|
57
57
|
orbitkit/pdf_writer/pdf_writer_simple.py
|
|
58
58
|
orbitkit/util/__init__.py
|
|
59
|
+
orbitkit/util/cache_asset_downloader.py
|
|
59
60
|
orbitkit/util/common.py
|
|
60
61
|
orbitkit/util/customize_regix_manager.py
|
|
61
62
|
orbitkit/util/secret_manager.py
|
orbitkit-0.8.35/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.35
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|