PyPI - orbitkit - Versions diffs - 0.8.60__tar.gz → 0.8.62__tar.gz - Mend

orbitkit 0.8.60.tar.gz → 0.8.62.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (80) hide show

{orbitkit-0.8.60/orbitkit.egg-info → orbitkit-0.8.62}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: orbitkit
-Version: 0.8.60
+Version: 0.8.62
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao

orbitkit-0.8.62/orbitkit/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.8.62

{orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/audio_transcoder/netmind_extract_v1.py RENAMED Viewed

@@ -104,7 +104,7 @@ def text_processing(netmind_data, lang, translate_model='gpt-4.1-mini'):
 def send_request_to_api(s3_remote_url, **kwargs):
     endpoint = kwargs['endpoint']
     token = kwargs['token']
-    lang = kwargs.get('lang', 'en')
+    lang = kwargs.get('lang', None)
     headers = {
         'Authorization': token,
     }
@@ -112,8 +112,9 @@ def send_request_to_api(s3_remote_url, **kwargs):
     files = {
         'model': (None, 'WhisperX'),
         'url': (None, s3_remote_url),
-        'language': (None, lang),
     }
+    if lang:
+        files['language'] = (None, lang)
     response = requests.post(endpoint, headers=headers, files=files)
     response.raise_for_status()
@@ -125,7 +126,7 @@ def send_request_to_api(s3_remote_url, **kwargs):
 def send_request_to_stream(file_steam, **kwargs):
     endpoint = kwargs['endpoint']
     token = kwargs['token']
-    lang = kwargs.get('lang', 'en')
+    lang = kwargs.get('lang', None)
     headers = {
         'Authorization': token,
     }
@@ -133,8 +134,9 @@ def send_request_to_stream(file_steam, **kwargs):
     files = {
         'model': (None, 'WhisperX'),
         'files': (None, file_steam),
-        'language': (None, lang),
     }
+    if lang:
+        files['language'] = (None, lang)
     response = requests.post(endpoint, headers=headers, files=files)
     response.raise_for_status()
@@ -143,7 +145,7 @@ def send_request_to_stream(file_steam, **kwargs):
 def request_wav_from_netmind(s3_client, s3_path=None, file_steam=None, **kwargs):
-    lang = kwargs.get('lang', 'en')
+    lang = kwargs.get('lang', None)
     folder = kwargs.get('folder', '')
     translate_model = kwargs.get('translate_model', 'gpt-4.1-mini')
     if s3_path:
@@ -166,7 +168,12 @@ def request_wav_from_netmind(s3_client, s3_path=None, file_steam=None, **kwargs)
     with open(json_netmind_wav_path, 'w', encoding='utf-8') as json_file:
         json.dump(data, json_file, ensure_ascii=False, indent=4)
-    net_process = text_processing(data, lang, translate_model=translate_model)
+    # lang 为 None 时跳过翻译，直接使用原始结果
+    if lang:
+        net_process = text_processing(data, lang, translate_model=translate_model)
+    else:
+        logger.info("lang=None, 跳过翻译，保持原语言")
+        net_process = data
     # 翻译接口处理
     json_netmind_lang_wav_path = os.path.join(folder, 'netmind_lang_wav.json')

{orbitkit-0.8.60 → orbitkit-0.8.62}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py RENAMED Viewed

@@ -338,12 +338,10 @@ class MinerUExtract:
         return self.default_extractable(pdf_pages, 'discarded_blocks', pages_body, block_seq)
     def default_blocks(self, pdf_pages):
-        print(pdf_pages.keys())
-        raise Exception('提取block方法异常')
+        raise Exception(f'提取block方法异常，未知参数: {list(pdf_pages.keys())}')
     def default_extractable(self, pdf_pages, block_key, pages_body, block_seq):
         for pages_block in pdf_pages.get(block_key, []):
-            print(pages_block['type'])
             handler_type_func = getattr(self, f'level_two_{pages_block["type"]}', self.level_two_default_blocks)
             block_seq, pages_body = handler_type_func(block_seq, pages_block, pages_body)
         return pages_body
@@ -431,9 +429,10 @@ class MinerUExtract:
         return self.level_two_txt_com(block_seq, pages_block, pages_body, 'sentence')
     def level_two_table(self, block_seq, pages_block, pages_body):
-        """处理 table 类型 block，sentence 直接存储 HTML 格式"""
+        """处理 table 类型 block，sentence 直接存储 HTML 格式，脚注等文本单独生成 block"""
         table_blocks = pages_block.get('blocks', [])
         table_html = ""
+        text_contents = []  # 收集 table_footnote 等文本内容
         _image_detail_arr = []
         bbox = pages_block.get('bbox', [0, 0, 0, 0])
@@ -444,23 +443,42 @@ class MinerUExtract:
                     span_type = _span.get('type', '')
                     if span_type == 'table' and 'html' in _span:
                         table_html = _span['html']
+                    elif 'content' in _span:  # table_footnote 等文本类型
+                        text_contents.append(_span['content'])
                     if _span.get('image_path'):
                         _image_detail_arr.append({"path": f'images/{_span["image_path"]}', "desc": ""})
-        _block_str = table_html
-        if _block_str or _image_detail_arr:
+        # 表格 HTML 单独一个 block
+        if table_html or _image_detail_arr:
             block_seq = block_seq + 1
             text_location = self.get_location(bbox)
             self.blocks.append({
                 "id": id_srv.get_random_short_id(),
                 "page": self.page,
                 "seq_no": block_seq,
-                "sentence": _block_str,
+                "sentence": table_html,
                 "type": 'table',
                 "image_detail": _image_detail_arr,
                 "text_location": text_location
             })
-            pages_body.append(_block_str)
+            pages_body.append(table_html)
+        # 脚注等文本单独一个 block
+        if text_contents:
+            block_seq = block_seq + 1
+            text_location = self.get_location(bbox)
+            _text_str = '\n'.join(text_contents)
+            self.blocks.append({
+                "id": id_srv.get_random_short_id(),
+                "page": self.page,
+                "seq_no": block_seq,
+                "sentence": _text_str,
+                "type": 'sentence',
+                "image_detail": [],
+                "text_location": text_location
+            })
+            pages_body.append(_text_str)
         return block_seq, pages_body
     def level_two_image(self, block_seq, pages_block, pages_body):
@@ -541,8 +559,7 @@ class MinerUExtract:
         return block_seq, pages_body
     def level_two_default_blocks(self, block_seq, pages_block, pages_body):
-        print(pages_block.keys())
-        raise Exception('提取block方法异常')
+        raise Exception(f'提取block方法异常，未知类型: {pages_block.get("type")}，参数: {list(pages_block.keys())}')
     def get_com_lines(self, pages_lines):
         _block_arr = []

{orbitkit-0.8.60 → orbitkit-0.8.62/orbitkit.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: orbitkit
-Version: 0.8.60
+Version: 0.8.62
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao