orbitkit 0.8.59__tar.gz → 0.8.61__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {orbitkit-0.8.59/orbitkit.egg-info → orbitkit-0.8.61}/PKG-INFO +1 -1
  2. orbitkit-0.8.61/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/audio_transcoder/netmind_extract_v1.py +13 -6
  4. orbitkit-0.8.61/orbitkit/pdf_extractor/mineru_demo.py +133 -0
  5. orbitkit-0.8.61/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +1105 -0
  6. {orbitkit-0.8.59 → orbitkit-0.8.61/orbitkit.egg-info}/PKG-INFO +1 -1
  7. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit.egg-info/SOURCES.txt +1 -0
  8. orbitkit-0.8.59/orbitkit/VERSION +0 -1
  9. orbitkit-0.8.59/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -259
  10. {orbitkit-0.8.59 → orbitkit-0.8.61}/LICENSE +0 -0
  11. {orbitkit-0.8.59 → orbitkit-0.8.61}/MANIFEST.in +0 -0
  12. {orbitkit-0.8.59 → orbitkit-0.8.61}/README.md +0 -0
  13. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/__init__.py +0 -0
  14. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/airflow_handler/__init__.py +0 -0
  15. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
  16. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
  17. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  18. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/airflow_handler/file_handler.py +0 -0
  19. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  20. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/audio_transcoder/__init__.py +0 -0
  21. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/constant/__init__.py +0 -0
  22. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/constant/report_schema.py +0 -0
  23. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/id_srv/__init__.py +0 -0
  24. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/id_srv/id_gen.py +0 -0
  25. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/id_srv/id_perm_like.py +0 -0
  26. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/lark_send/__init__.py +0 -0
  27. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/lark_send/lark.py +0 -0
  28. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/llm_tools/__init__.py +0 -0
  29. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  30. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/orbit_type/__init__.py +0 -0
  31. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  32. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  33. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/orbit_type/tools.py +0 -0
  34. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_embedding/__init__.py +0 -0
  35. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  36. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  37. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor/__init__.py +0 -0
  38. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  39. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor/exceptions.py +0 -0
  40. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  41. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  42. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  43. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  44. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  45. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  46. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  47. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  48. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  49. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor_simple/base.py +0 -0
  50. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  51. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor_simple/core.py +0 -0
  52. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  53. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  54. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  55. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_writer/__init__.py +0 -0
  56. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  57. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/__init__.py +0 -0
  58. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/cache_asset_downloader.py +0 -0
  59. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/common.py +0 -0
  60. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/customize_regix_manager.py +0 -0
  61. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/secret_manager.py +0 -0
  62. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/universal_extractor.py +0 -0
  63. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_aliyun.py +0 -0
  64. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  65. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_aws.py +0 -0
  66. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  67. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_date.py +0 -0
  68. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_html.py +0 -0
  69. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_kafka.py +0 -0
  70. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_md5.py +0 -0
  71. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_selenium.py +0 -0
  72. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_simple_timer.py +0 -0
  73. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_str.py +0 -0
  74. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_type_mapping.py +0 -0
  75. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit/util/util_url.py +0 -0
  76. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit.egg-info/dependency_links.txt +0 -0
  77. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit.egg-info/not-zip-safe +0 -0
  78. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit.egg-info/requires.txt +0 -0
  79. {orbitkit-0.8.59 → orbitkit-0.8.61}/orbitkit.egg-info/top_level.txt +0 -0
  80. {orbitkit-0.8.59 → orbitkit-0.8.61}/setup.cfg +0 -0
  81. {orbitkit-0.8.59 → orbitkit-0.8.61}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.59
3
+ Version: 0.8.61
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.61
@@ -104,7 +104,7 @@ def text_processing(netmind_data, lang, translate_model='gpt-4.1-mini'):
104
104
  def send_request_to_api(s3_remote_url, **kwargs):
105
105
  endpoint = kwargs['endpoint']
106
106
  token = kwargs['token']
107
- lang = kwargs.get('lang', 'en')
107
+ lang = kwargs.get('lang', None)
108
108
  headers = {
109
109
  'Authorization': token,
110
110
  }
@@ -112,8 +112,9 @@ def send_request_to_api(s3_remote_url, **kwargs):
112
112
  files = {
113
113
  'model': (None, 'WhisperX'),
114
114
  'url': (None, s3_remote_url),
115
- 'language': (None, lang),
116
115
  }
116
+ if lang:
117
+ files['language'] = (None, lang)
117
118
 
118
119
  response = requests.post(endpoint, headers=headers, files=files)
119
120
  response.raise_for_status()
@@ -125,7 +126,7 @@ def send_request_to_api(s3_remote_url, **kwargs):
125
126
  def send_request_to_stream(file_steam, **kwargs):
126
127
  endpoint = kwargs['endpoint']
127
128
  token = kwargs['token']
128
- lang = kwargs.get('lang', 'en')
129
+ lang = kwargs.get('lang', None)
129
130
  headers = {
130
131
  'Authorization': token,
131
132
  }
@@ -133,8 +134,9 @@ def send_request_to_stream(file_steam, **kwargs):
133
134
  files = {
134
135
  'model': (None, 'WhisperX'),
135
136
  'files': (None, file_steam),
136
- 'language': (None, lang),
137
137
  }
138
+ if lang:
139
+ files['language'] = (None, lang)
138
140
 
139
141
  response = requests.post(endpoint, headers=headers, files=files)
140
142
  response.raise_for_status()
@@ -143,7 +145,7 @@ def send_request_to_stream(file_steam, **kwargs):
143
145
 
144
146
 
145
147
  def request_wav_from_netmind(s3_client, s3_path=None, file_steam=None, **kwargs):
146
- lang = kwargs.get('lang', 'en')
148
+ lang = kwargs.get('lang', None)
147
149
  folder = kwargs.get('folder', '')
148
150
  translate_model = kwargs.get('translate_model', 'gpt-4.1-mini')
149
151
  if s3_path:
@@ -166,7 +168,12 @@ def request_wav_from_netmind(s3_client, s3_path=None, file_steam=None, **kwargs)
166
168
  with open(json_netmind_wav_path, 'w', encoding='utf-8') as json_file:
167
169
  json.dump(data, json_file, ensure_ascii=False, indent=4)
168
170
 
169
- net_process = text_processing(data, lang, translate_model=translate_model)
171
+ # lang None 时跳过翻译,直接使用原始结果
172
+ if lang:
173
+ net_process = text_processing(data, lang, translate_model=translate_model)
174
+ else:
175
+ logger.info("lang=None, 跳过翻译,保持原语言")
176
+ net_process = data
170
177
 
171
178
  # 翻译接口处理
172
179
  json_netmind_lang_wav_path = os.path.join(folder, 'netmind_lang_wav.json')
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ MinerU PDF 解析工具使用示例
5
+ """
6
+ import os
7
+ import tempfile
8
+ import boto3
9
+ from urllib.parse import urlparse
10
+ from pdf_extractor_minerU_v1 import MinerUSingleTask
11
+
12
+
13
def _split_s3_path(s3_path: str) -> tuple:
    """Split an S3 URL into ``(bucket, key, filename)``.

    :param s3_path: S3 location such as ``s3://bucket/path/to/file.pdf``
    :return: tuple of bucket name, object key and the key's base file name
    :raises ValueError: if the bucket or the file name cannot be determined
    """
    parsed = urlparse(s3_path)
    bucket = parsed.netloc
    key = parsed.path.lstrip('/')
    filename = os.path.basename(key)
    # An empty bucket or file name can never be downloaded; fail early with a
    # clear message instead of an obscure error from the S3 client later on.
    if not bucket or not filename:
        raise ValueError(
            f"Invalid S3 path {s3_path!r}: expected s3://bucket/path/to/file.pdf"
        )
    return bucket, key, filename


def process_pdf(s3_path: str, api_token: str,
                supported_versions: list = None, model_version: str = "vlm",
                is_ocr: bool = True, enable_formula: bool = True,
                enable_table: bool = True, timeout: int = 1800,
                keep_all_files: bool = True) -> str:
    """
    Parse a PDF stored on S3 with MinerU and return the output location.

    The PDF is downloaded into a temporary directory, submitted to MinerU,
    awaited, and the results are downloaded and post-processed by the client.

    :param s3_path: S3 path (e.g. s3://bucket/path/to/file.pdf)
    :param api_token: MinerU API token
    :param supported_versions: accepted API versions, e.g. ["2.7.3"];
        None (or an empty list) disables the version check
    :param model_version: model version "pipeline" | "vlm" | "MinerU-HTML"
    :param is_ocr: whether to enable OCR
    :param enable_formula: whether to enable formula recognition
    :param enable_table: whether to enable table recognition
    :param timeout: timeout in seconds passed to ``wait_for_complete``
    :param keep_all_files: whether to keep all result files
    :return: the output path returned by ``client.download_and_process``
        (the original documentation describes it as ``txt-vector/...``)
    :raises ValueError: if ``s3_path`` has no bucket or no file name
    :raises RuntimeError: from the version callback when the API reports a
        version that is not listed in ``supported_versions``
    """

    # Version-check callback handed to the MinerU client.
    def check_version(version_info):
        if supported_versions is None:
            return
        version = version_info.get("_version_name", "")
        if version not in supported_versions:
            # RuntimeError is still an Exception, so existing handlers keep working.
            raise RuntimeError(
                f"不支持的 MinerU API 版本: {version}\n"
                f"当前支持: {supported_versions}\n"
                f"请更新代码以支持新版本的 block 类型"
            )
        print(f"版本检查通过: {version}")

    # Validate the S3 path before creating any client.
    bucket, key, filename = _split_s3_path(s3_path)

    # Initialise the clients; the callback is only installed when there is
    # something to check against.
    version_checker = check_version if supported_versions else None
    client = MinerUSingleTask(api_token, version_checker=version_checker)
    s3_client = boto3.client('s3')

    print("=" * 60)
    print("MinerU PDF 解析工具")
    print("=" * 60)
    print(f"输入: {s3_path}")

    # Step 0: download the PDF from S3 to a local temporary directory.
    print("\n[步骤0] 从 S3 下载 PDF...")
    # The temporary directory stays alive until processing has finished so the
    # local PDF is guaranteed to exist while it is being uploaded.
    with tempfile.TemporaryDirectory() as temp_dir:
        local_pdf_path = os.path.join(temp_dir, filename)
        s3_client.download_file(bucket, key, local_pdf_path)
        print(f"下载完成: {local_pdf_path}")

        # Step 1: upload the file to MinerU.
        print("\n[步骤1] 上传文件到 MinerU...")
        tasks = client.submit_task(
            local_pdf_path,
            model_version=model_version,
            is_ocr=is_ocr,
            enable_formula=enable_formula,
            enable_table=enable_table
        )

        # Step 2: wait for the task to complete.
        print("\n[步骤2] 等待任务完成...")
        results = client.wait_for_complete(tasks, timeout=timeout)

        # Step 3: download and process the results; the original S3 path is
        # passed so the client can derive the output directory from it.
        print("\n[步骤3] 下载并处理结果...")
        output_path = client.download_and_process(
            results,
            s3_path,
            s3_client,
            keep_all_files=keep_all_files
        )

    print("\n" + "=" * 60)
    print(f"完成! 结果已上传到: {output_path}/")
    print("=" * 60)

    return output_path
97
+
98
+
99
if __name__ == '__main__':
    # ==================== Configuration ====================

    # MinerU API token (obtain one from https://mineru.net).
    API_TOKEN = "你的API Token"

    # S3 location of the PDF file to parse.
    S3_PATH = "s3://your-bucket/path/to/file.pdf"

    # Accepted API versions (None disables the version check).
    SUPPORTED_VERSIONS = ["2.7.3"]

    # Model version: "pipeline" | "vlm" | "MinerU-HTML".
    MODEL_VERSION = "vlm"

    # Remaining options.
    IS_OCR, ENABLE_FORMULA, ENABLE_TABLE = True, True, True
    TIMEOUT = 1800
    KEEP_ALL_FILES = True

    # ==================== Run ====================

    # Gather the settings into one mapping and forward them as keyword arguments.
    run_options = {
        "s3_path": S3_PATH,
        "api_token": API_TOKEN,
        "supported_versions": SUPPORTED_VERSIONS,
        "model_version": MODEL_VERSION,
        "is_ocr": IS_OCR,
        "enable_formula": ENABLE_FORMULA,
        "enable_table": ENABLE_TABLE,
        "timeout": TIMEOUT,
        "keep_all_files": KEEP_ALL_FILES,
    }
    process_pdf(**run_options)