PyPI - orbitkit - Versions diffs - 0.8.44__tar.gz → 0.8.46__tar.gz - Mend

orbitkit 0.8.44.tar.gz → 0.8.46.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78)

{orbitkit-0.8.44/orbitkit.egg-info → orbitkit-0.8.46}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: orbitkit
-Version: 0.8.44
+Version: 0.8.46
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao

orbitkit-0.8.46/orbitkit/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.8.46

{orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/data_preprocessing.py RENAMED Viewed

@@ -126,7 +126,7 @@ class DocumentProcessor:
         return "extract" if reported_date < datetime.datetime(2020, 1, 1) else target_stage
     @classmethod
-    async def create_record(cls, doc, start_stage):
+    async def create_record(cls, doc, start_stage, important_level):
         attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
         s3_path_info = []
         add_extends = {}
@@ -151,7 +151,7 @@ class DocumentProcessor:
                     'store_path'],
                 'file_name': att['file_name']
             })
-        result_dict = {'id': doc['_id'], 's3_path_info': s3_path_info}
+        result_dict = {'id': doc['_id'], 's3_path_info': s3_path_info, 'important_level': important_level}
         if add_extends:
             result_dict['extends'] = add_extends
         return True, result_dict
@@ -165,7 +165,7 @@ class DocumentProcessor:
         }
     @classmethod
-    async def process(cls, doc, custom_process_step):
+    async def process(cls, doc, custom_process_step, important_level):
         report_id = doc['_id']
         # 筛选文件
         doc = cls.stock_us_filter_by_is_primary(doc)
@@ -195,7 +195,7 @@ class DocumentProcessor:
                                           "Invalid process sequence: 'start_stage' occurs before 'target_stage'.",
                                           doc['_id'])
-        file_name_check_status, record = await cls.create_record(doc, start_stage)
+        file_name_check_status, record = await cls.create_record(doc, start_stage, important_level)
         if not file_name_check_status:
             return cls.create_result_info("step_error", "Document file name too lang.", report_id)

{orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/file_flow_entry_process.py RENAMED Viewed

@@ -177,7 +177,13 @@ class FilingOfficialProcessor:
     async def process_task_entry(self, source: Literal["filing_data", "reports_view", "G7_demo"],
                            query: dict, tags: list[str], priority: str,
-                           is_important: bool = False, custom_step: Optional[list[str]] = None):
+                           is_important: bool = False, custom_step: Optional[list[str]] = None, important_level = None):
+        if not important_level or not isinstance(important_level, int):
+            important_level = 0
+        if important_level == 0:
+            raise ValueError(f'important_level must be an integer (int) greater than 0. {important_level}')
         allowed_steps = {"convert", "extract", "embedding"}
         if custom_step is not None:
@@ -206,7 +212,7 @@ class FilingOfficialProcessor:
                 self.all_stat_count['all'] += 1
                 for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
                     perm_id_set.add(orbit_entity_id)
-                result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step)
+                result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step, important_level=important_level)
                 process_data.append(result_record)
                 if len(process_data) >= self.max_batch_size:
                     file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(

{orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/file_flow_exit_process.py RENAMED Viewed

@@ -42,7 +42,9 @@ class FlowUpdater:
         parent_id_store_path_map = {i['parent_id']: i['store_path'] for i in attachments_pdf}
         x_attachments_pdf = []
         for item in attachments:
-            store_path = item['store_path']
+            # The video branch will generate store_path_pre, so check for the existence of store_path_pre first.
+            # For other branches, the value will be null.
+            store_path = item['store_path_pre'] or item['store_path']
             parent_id = item['id']
             if store_path not in db_store_path_set:
                 raise ValueError(f"store_path not found in db: {store_path}")
@@ -109,7 +111,6 @@ class FlowUpdater:
         attachments_pdf = op_meta_record['x_attachments_pdf']
         data_source = op_meta_record['data_source']
-        # 校验参数
         if not report_id or not status or not start_stage or not current_stage or not target_stage or (not attachments and start_stage == 'convert'):
             raise ValueError(f"Invalid op_meta_record: {op_meta_record}")
         if status == 'success' and target_stage != current_stage:
@@ -117,7 +118,6 @@ class FlowUpdater:
             return
         attachments = [i for i in attachments if i['category'] == 'x_attachments']
-        # 确定结束阶段
         end_stage = target_stage if status == 'success' else current_stage if status == 'failed' else None
         if end_stage is None:
             logger.info(f"Invalid status: {status}.")
@@ -127,11 +127,9 @@ class FlowUpdater:
         if start_index > end_index:
             raise ValueError(f"start_stage cannot be after end_stage: {start_stage} -> {end_stage}.")
-        # 开始执行回更逻辑
         logger.info(
             f"😊 _id: {report_id}-{status}, start_step: {self.step_tuple[start_index]}, end_step: {self.step_tuple[end_index]}")
-        # 查询这个报告是否存在于当前数据源
         db_doc = self._check_and_create_collection(data_source).find_one({'_id': report_id},
                                                                          {'_id': 1, 'x_attachments': 1,
                                                                           'x_status_list': 1})
@@ -143,10 +141,8 @@ class FlowUpdater:
             logger.warning(f"{db_doc['_id']} statxus is not 'crawl_downloaded'")
             return
-        # 构建数据库中存在store_path 防止出现数据库中x_attachments与x_attachments_pdf数据不一致问题
         db_store_path = {f"s3://{i['bucket']}/{i['store_path']}" for i in db_doc['x_attachments']}
-        # 构建更新参数
         update_params = {}
         step_status = True
         for index, step in enumerate(self.step_tuple[start_index:end_index + 1], 1):
@@ -156,7 +152,6 @@ class FlowUpdater:
             item = self._step_handle(step, step_status, attachments, db_store_path, attachments_pdf)
             update_params.update(item)
-        # 执行更新
         if update_params:
             # logger.info(json.dumps(update_params, ensure_ascii=False, indent=2))
             self.update_mongo_data(report_id, data_source, update_params, kafka_ignore)

{orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/file_handler_v2.py RENAMED Viewed

@@ -137,7 +137,8 @@ class FileFlowHandleV2:
             'created_at': now,
             'updated_at': now,
             'tags': params['tags'],
-            'tag': params['tag']
+            'tag': params['tag'],
+            'important_level': record.get('important_level', 0)
         }
         step = {

{orbitkit-0.8.44 → orbitkit-0.8.46/orbitkit.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: orbitkit
-Version: 0.8.44
+Version: 0.8.46
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao