orbitkit 0.8.40__tar.gz → 0.8.42__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.40/orbitkit.egg-info → orbitkit-0.8.42}/PKG-INFO +1 -1
- orbitkit-0.8.42/orbitkit/VERSION +1 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/airflow_handler/data_preprocessing.py +28 -23
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/airflow_handler/file_flow_entry_process.py +20 -10
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/airflow_handler/file_flow_exit_process.py +11 -7
- {orbitkit-0.8.40 → orbitkit-0.8.42/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.40/orbitkit/VERSION +0 -1
- {orbitkit-0.8.40 → orbitkit-0.8.42}/LICENSE +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/MANIFEST.in +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/README.md +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/setup.cfg +0 -0
- {orbitkit-0.8.40 → orbitkit-0.8.42}/setup.py +0 -0
orbitkit-0.8.42/orbitkit/VERSION
ADDED
@@ -0,0 +1 @@
+0.8.42
{orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/airflow_handler/data_preprocessing.py
CHANGED
@@ -1,13 +1,13 @@
 import os
 import datetime
 from collections import defaultdict
-import
-
+from importlib.metadata import version
+import googletrans
 
 
 class DocumentProcessor:
-
-
+    if version("googletrans") < "4.0.2":
+        raise ImportError(f"googletrans >= 4.0.2 is required for async support. {version('googletrans')}")
     AUDIO_SUFFIXES = [".mp3", ".wav", ".aac", ".wma", ".m4a"]
     VIDEO_SUFFIXES = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".3gp", ".hevc"]
     PDF_SUFFIXES = [".pdf"]
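The new import-time guard reads the installed googletrans version through importlib.metadata and refuses to import on anything older than 4.0.2. Below is a minimal sketch of the same guard pattern; it is illustrative only and swaps the plain string comparison used above for packaging.version, which is an assumption (this module does not import packaging).

from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version  # assumption: packaging is available in the environment

def require_min_version(dist: str, minimum: str) -> None:
    # Fail fast if `dist` is missing or older than `minimum`.
    try:
        installed = version(dist)
    except PackageNotFoundError as exc:
        raise ImportError(f"{dist} is not installed") from exc
    if Version(installed) < Version(minimum):
        raise ImportError(f"{dist} >= {minimum} is required for async support, found {installed}")

require_min_version("googletrans", "4.0.2")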
@@ -21,15 +21,13 @@ class DocumentProcessor:
     def get_file_suffix(file_path):
         return f".{file_path.split('.')[-1]}".lower()
 
-    @
-    async def
-
+    @staticmethod
+    async def translate_text(text, dest='en'):
+        """Async translation function https://pypi.org/project/googletrans/"""
+        translator = googletrans.Translator()
+        result = await translator.translate(text, dest=dest)
         return result.text
 
-    @classmethod
-    def translate_sync(cls, text, dest='en'):
-        return asyncio.run(cls.translate(text, dest))
-
     @staticmethod
     def create_xbrl_template():
         return {
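With the synchronous translate_sync wrapper removed, call sites have to run the coroutine themselves. A minimal usage sketch, assuming googletrans >= 4.0.2 is installed and the translation endpoint is reachable; translate_title is a hypothetical helper mirroring the new translate_text.

import asyncio
import googletrans

async def translate_title(title: str, dest: str = "en") -> str:
    # Same pattern as the new DocumentProcessor.translate_text: await the Translator directly.
    translator = googletrans.Translator()
    result = await translator.translate(title, dest=dest)
    return result.text

# From synchronous code (roughly what the removed translate_sync did with asyncio.run):
print(asyncio.run(translate_title("季度财务报告")))  # "Quarterly financial report"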
@@ -93,12 +91,15 @@ class DocumentProcessor:
         return False, None
 
     @staticmethod
-    def get_start_stage_target_stage(doc):
+    def get_start_stage_target_stage(doc, custom_process_step_list):
         status_info = doc.get('x_status_list', {}).get('status_convert', {})
         status = status_info.get('status')
         status_txt = status_info.get('status_txt')
         x_spider_name = doc['x_spider_name']
 
+        if custom_process_step_list:
+            return custom_process_step_list[0], custom_process_step_list[1], x_spider_name
+
         if status != 'convert_done':
             return 'convert', 'embedding', x_spider_name
 
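When a custom_process_step_list is supplied, the status-based resolution above is skipped: its first element becomes start_stage and its second target_stage. A hypothetical call shape (the doc dict and spider name below are stand-ins, not real records):

from orbitkit.airflow_handler.data_preprocessing import DocumentProcessor

doc = {
    "x_status_list": {"status_convert": {"status": "convert_done", "status_txt": ""}},
    "x_spider_name": "stock_us",  # hypothetical spider name
}

start_stage, target_stage, spider = DocumentProcessor.get_start_stage_target_stage(
    doc, custom_process_step_list=["convert", "extract"]
)
assert (start_stage, target_stage, spider) == ("convert", "extract", "stock_us")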
@@ -111,12 +112,12 @@ class DocumentProcessor:
         return 'success', 'success', x_spider_name
 
     @staticmethod
-    def
+    def update_target_stage_by_report_type(doc, target_stage):
         report_type_ids = doc.get('x_orbit_data', {}).get('report_type_id_list', [])
         return "extract" if report_type_ids == ['19999'] else target_stage
 
     @staticmethod
-    def
+    def update_target_stage_by_reported_at(doc, target_stage):
         date_str = doc.get('x_reported_at_utc_date', '1970-01-01')
         try:
             reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
@@ -125,7 +126,7 @@ class DocumentProcessor:
         return "extract" if reported_date < datetime.datetime(2020, 1, 1) else target_stage
 
     @classmethod
-    def create_record(cls, doc, start_stage):
+    async def create_record(cls, doc, start_stage):
         attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
         s3_path_info = []
         add_extends = {}
@@ -138,7 +139,7 @@ class DocumentProcessor:
             if ext in cls.AUDIO_SUFFIXES or ext in cls.VIDEO_SUFFIXES:
                 add_extends = {
                     "original_title": doc['x_orbit_data']['report_title'],
-                    "title": cls.
+                    "title": await cls.translate_text(text=doc['x_orbit_data']['report_title']),
                     "published": doc['x_reported_at_utc_date'],
                     "tickers": [],
                     "perm_id_list": doc['x_orbit_data']['perm_id_list'],
@@ -164,7 +165,7 @@ class DocumentProcessor:
         }
 
     @classmethod
-    def process(cls, doc,
+    async def process(cls, doc, custom_process_step):
         report_id = doc['_id']
         # filter the files
         doc = cls.stock_us_filter_by_is_primary(doc)
@@ -178,19 +179,23 @@ class DocumentProcessor:
         if is_xbrl:
             return cls.create_result_info("xbrl", "XBRL format cannot be processed.", xbrl_data)
 
-        start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc)
+        start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc, custom_process_step)
 
-        # skip embedding for data under special conditions (type '19999' and report date earlier than 2020-01-01)
-        if target_stage == 'embedding' and
-            target_stage = cls.
-            target_stage = cls.
+        # check: skip embedding for data under special conditions (type '19999' and report date earlier than 2020-01-01)
+        if target_stage == 'embedding' and not custom_process_step:
+            target_stage = cls.update_target_stage_by_report_type(doc, target_stage)
+            target_stage = cls.update_target_stage_by_reported_at(doc, target_stage)
+        # special case: only extraction is needed, but this record is filtered out and needs no embedding
+        if start_stage == 'embedding' and target_stage == 'extract':
+            start_stage = 'success'
+            target_stage = 'success'
 
         if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
             return cls.create_result_info("step_error",
                                           "Invalid process sequence: 'start_stage' occurs before 'target_stage'.",
                                           doc['_id'])
 
-        file_name_check_status, record = cls.create_record(doc, start_stage)
+        file_name_check_status, record = await cls.create_record(doc, start_stage)
         if not file_name_check_status:
             return cls.create_result_info("step_error", "Document file name too lang.", report_id)
 
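Because process and create_record are now coroutines, callers need an event loop. A rough sketch of the new call shape, assuming doc is a fully populated filing record (everything process reads from it is elided here):

import asyncio

from orbitkit.airflow_handler.data_preprocessing import DocumentProcessor

async def handle_document(doc: dict):
    # Default behaviour: stages are derived from the document's convert status.
    result_default = await DocumentProcessor.process(doc, custom_process_step=None)
    # Forced behaviour: run exactly the given start/target pair, skipping the embedding exclusions.
    result_forced = await DocumentProcessor.process(doc, custom_process_step=["convert", "embedding"])
    return result_default, result_forced

# asyncio.run(handle_document(doc))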
{orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/airflow_handler/file_flow_entry_process.py
CHANGED
@@ -1,7 +1,7 @@
 import os
 from collections import Counter
 from datetime import datetime
-from typing import Literal
+from typing import Literal, Optional
 import logging
 import pymongo
 import pytz
@@ -30,15 +30,13 @@ class FilingOfficialProcessor:
         self.mongo_client = pymongo.MongoClient(mongo_uri)
         self.data_xbrl_convert_collection = self.mongo_client['filing_reports']['data_xbrl_convert']
         self.filing_data_collection = self.mongo_client['filing_reports']['filing_data']
-        self.filing_reports_astock_test0822_collection = self.mongo_client['filing_reports']['filing_reports_astock_test0822']
         self.annotation_reports_view_rows_collection = self.mongo_client['filing_reports'][
             'annotation_reports_view_rows']
         self.source_map = {
             'filing_data': (self.filing_data_collection, 'filing_data'),
-            '
+            'G7_demo': (self.filing_data_collection, 'G7_demo'),
             'reports_view': [
-                (self.filing_data_collection, 'filing_data')
-                (self.filing_reports_astock_test0822_collection, 'filing_reports_astock_test0822')
+                (self.filing_data_collection, 'filing_data')
             ]
         }
         self.only_low_important_set = {'internal_seekingalpha'}
@@ -122,7 +120,7 @@ class FilingOfficialProcessor:
                 "x_status_list.status_convert.status_meta": "meta_init",
                 "x_updated_date": datetime.now(tz=pytz.timezone('UTC')).strftime("%Y-%m-%dT%H:%M:%S%z"),
             }})
-        logger.info(f'
+        logger.info(f'Unable to convert {len(report_id_list)} document(s) due to unsupported file type.')
 
     def update_extends_fields(self, perm_id_list, file_flow_info):
         stmt = select(self.pi2_table.c.orbit_entity_id, self.pi2_table.c.ticker).where(self.pi2_table.c.orbit_entity_id.in_(perm_id_list))
@@ -178,9 +176,19 @@ class FilingOfficialProcessor:
         logger.info(f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: {status}, message: {message}")
 
 
-    def process_task_entry(self, source: Literal["filing_data", "
+    async def process_task_entry(self, source: Literal["filing_data", "reports_view", "G7_demo"],
                                  query: dict, tags: list[str], priority: str,
-                                 is_important: bool = False,
+                                 is_important: bool = False, custom_step: Optional[list[str]] = None):
+
+        allowed_steps = {"convert", "extract", "embedding"}
+        if custom_step is not None:
+            if not isinstance(custom_step, list):
+                raise ValueError("custom_step must be a list or None.")
+            if len(custom_step) > 2:
+                raise ValueError("custom_step can contain at most two elements.")
+            for step in custom_step:
+                if step not in allowed_steps:
+                    raise ValueError(f"Invalid step '{step}'. Allowed steps are: {allowed_steps}")
 
         if source == 'reports_view':
             collections = self.source_map[source]
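Since process_task_entry is now a coroutine and validates custom_step before touching any data, a caller might drive it roughly as below. The query, tags, and priority values are placeholders; only the parameter names and the allowed step values come from the diff.

import asyncio

async def main(processor):  # processor: an already-constructed FilingOfficialProcessor
    # Valid: at most two steps, each one of {"convert", "extract", "embedding"}.
    await processor.process_task_entry(
        source="filing_data",
        query={"x_spider_name": "stock_us"},   # placeholder filter
        tags=["daily"],                         # placeholder tags
        priority="high",                        # placeholder priority
        is_important=False,
        custom_step=["convert", "embedding"],
    )
    # Invalid examples raise ValueError before any documents are read:
    #   custom_step="convert"              -> not a list
    #   custom_step=["convert", "x", "y"]  -> more than two elements
    #   custom_step=["transcode"]          -> not an allowed step

# asyncio.run(main(processor))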
@@ -199,7 +207,8 @@ class FilingOfficialProcessor:
                 self.all_stat_count['all'] += 1
                 for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
                     perm_id_set.add(orbit_entity_id)
-
+                result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step)
+                process_data.append(result_record)
                 if len(process_data) >= self.max_batch_size:
                     file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
                         process_data)
@@ -226,4 +235,5 @@ class FilingOfficialProcessor:
             process_data.clear()
             perm_id_set.clear()
 
-        logger.info(f"finish processing {self.all_stat_count}.")
+        logger.info(f"finish processing {self.all_stat_count}. \n")
+        self.all_stat_count = {'all': 0, 'skip': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
{orbitkit-0.8.40 → orbitkit-0.8.42}/orbitkit/airflow_handler/file_flow_exit_process.py
CHANGED
@@ -27,9 +27,9 @@ class FlowUpdater:
             setattr(self, collection_name, self.coon[data_source])
         return getattr(self, collection_name)
 
-    def _handle_convert(self, status, attachments, db_store_path_set):
-        if not attachments:
-            raise ValueError("
+    def _handle_convert(self, status, attachments, db_store_path_set, attachments_pdf):
+        if not attachments or not attachments_pdf:
+            raise ValueError("Missing attachments: neither 'attachments' nor 'attachments_pdf' was provided.")
         if not status:
             return {
                 'x_status_list.status_convert.status': 'convert_failed',
@@ -38,15 +38,18 @@ class FlowUpdater:
             }
 
         store_path_set = set()
+        parent_id_store_path_map = {i['parent_id']: i['store_path'] for i in attachments_pdf}
         x_attachments_pdf = []
        for item in attachments:
             store_path = item['store_path']
+            parent_id = item['id']
             if store_path not in db_store_path_set:
                 raise ValueError(f"store_path not found in db: {store_path}")
             if store_path in store_path_set:
                 continue
             store_path_set.add(store_path)
-            new_store_path = store_path if store_path.lower().endswith('.pdf') else store_path + '.pdf'
+            # new_store_path = store_path if store_path.lower().endswith('.pdf') else store_path + '.pdf'
+            new_store_path = parent_id_store_path_map[parent_id]
             x_attachments_pdf.append({
                 "store_path": new_store_path,
                 "store_path_txt": "",
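The rewritten _handle_convert no longer derives the PDF path from the original store_path; it looks it up by parent_id in attachments_pdf. The snippet below only illustrates the shape the new code expects (the ids and paths are placeholders): each attachments item carries an 'id', and each attachments_pdf item carries a matching 'parent_id' plus the converted PDF's 'store_path'.

attachments = [
    {"id": "att-001", "store_path": "s3://bucket/raw/report.docx"},       # placeholder path
]
attachments_pdf = [
    {"parent_id": "att-001", "store_path": "s3://bucket/pdf/report.pdf"},  # placeholder path
]

# What the new mapping resolves for each attachment:
parent_id_store_path_map = {i["parent_id"]: i["store_path"] for i in attachments_pdf}
for item in attachments:
    new_store_path = parent_id_store_path_map[item["id"]]
    # -> "s3://bucket/pdf/report.pdf" instead of the old store_path + ".pdf" fallback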
@@ -76,11 +79,11 @@ class FlowUpdater:
             return {}
         return {'x_status_list.status_convert.status_txt': 'convert_txt_embedding'}
 
-    def _step_handle(self, step_stage, status, attachments, db_store_path):
+    def _step_handle(self, step_stage, status, attachments, db_store_path, attachments_pdf):
         method_name = f"_handle_{step_stage}"
         method = getattr(self, method_name, None)
         if method:
-            return method(status, attachments=attachments,
+            return method(status, attachments=attachments, attachments_pdf=attachments_pdf,
                           db_store_path_set=db_store_path) if step_stage == 'convert' else method(status)
         else:
             raise ValueError(f"Unknown step_stage: {step_stage}")
@@ -102,6 +105,7 @@ class FlowUpdater:
         current_stage = op_meta_record['current_stage']
         target_stage = op_meta_record['target_stage']
         attachments = op_meta_record['x_attachments']
+        attachments_pdf = op_meta_record['x_attachments_pdf']
         data_source = op_meta_record['data_source']
 
         # validate parameters
@@ -148,7 +152,7 @@ class FlowUpdater:
             if step == end_stage and status == 'failed':
                 step_status = False
             logger.info(f' Processing step-{index} {step} - {"successfully" if step_status else "failed"}.')
-            item = self._step_handle(step, step_status, attachments, db_store_path)
+            item = self._step_handle(step, step_status, attachments, db_store_path, attachments_pdf)
             update_params.update(item)
 
         # perform the update
orbitkit-0.8.40/orbitkit/VERSION
DELETED
@@ -1 +0,0 @@
-0.8.40