orbitkit 0.8.48__tar.gz → 0.8.49__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.48/orbitkit.egg-info → orbitkit-0.8.49}/PKG-INFO +2 -15
- orbitkit-0.8.49/orbitkit/VERSION +1 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/data_preprocessing.py +8 -7
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_flow_entry_process.py +78 -54
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_handler_v2.py +11 -6
- {orbitkit-0.8.48 → orbitkit-0.8.49/orbitkit.egg-info}/PKG-INFO +2 -15
- orbitkit-0.8.48/orbitkit/VERSION +0 -1
- {orbitkit-0.8.48 → orbitkit-0.8.49}/LICENSE +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/MANIFEST.in +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/README.md +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/setup.cfg +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/setup.py +0 -0
{orbitkit-0.8.48/orbitkit.egg-info → orbitkit-0.8.49}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: orbitkit
-Version: 0.8.48
+Version: 0.8.49
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao
@@ -37,19 +37,6 @@ Requires-Dist: prettytable>=3.17.0
 Requires-Dist: pytz>=2025.2
 Requires-Dist: Deprecated
 Requires-Dist: func_timeout
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
-Dynamic: license-file
-Dynamic: maintainer
-Dynamic: maintainer-email
-Dynamic: platform
-Dynamic: requires-dist
-Dynamic: summary
 
 # orbitkit
 
```
orbitkit-0.8.49/orbitkit/VERSION

```diff
@@ -0,0 +1 @@
+0.8.49
```
{orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/data_preprocessing.py

```diff
@@ -12,7 +12,7 @@ class DocumentProcessor:
     VIDEO_SUFFIXES = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".3gp", ".hevc"]
     PDF_SUFFIXES = [".pdf"]
     DOC_SUFFIXES = [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"]
-    TXT_SUFFIXES = [".txt", ".html", ".htm"]
+    TXT_SUFFIXES = [".txt", ".html", ".htm", ".xhtml"]
     ALL_ALLOWED_SUFFIXES = set(AUDIO_SUFFIXES + VIDEO_SUFFIXES + PDF_SUFFIXES + DOC_SUFFIXES + TXT_SUFFIXES)
 
     DATA_PROCESS_STEPS = ['convert', 'extract', 'embedding', 'success']
```
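The practical effect of this hunk: `.xhtml` attachments now pass the shared suffix allow-list as text files. A minimal sketch of that check, assuming a helper `is_allowed` and an abbreviated `AUDIO_SUFFIXES` (neither appears in this hunk):

```python
import os

# Abbreviated stand-in: the real AUDIO_SUFFIXES list is not shown in this hunk.
AUDIO_SUFFIXES = [".mp3", ".wav"]
VIDEO_SUFFIXES = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".3gp", ".hevc"]
PDF_SUFFIXES = [".pdf"]
DOC_SUFFIXES = [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"]
TXT_SUFFIXES = [".txt", ".html", ".htm", ".xhtml"]  # ".xhtml" is new in 0.8.49
ALL_ALLOWED_SUFFIXES = set(AUDIO_SUFFIXES + VIDEO_SUFFIXES + PDF_SUFFIXES + DOC_SUFFIXES + TXT_SUFFIXES)

def is_allowed(store_path: str) -> bool:
    # Hypothetical helper: true when the file suffix is in the allow-list.
    return os.path.splitext(store_path.lower())[1] in ALL_ALLOWED_SUFFIXES

assert is_allowed("annual_report.xhtml")    # accepted from 0.8.49 on
assert not is_allowed("annual_report.zip")  # still rejected
```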
```diff
@@ -74,9 +74,10 @@ class DocumentProcessor:
     def xbrl_type_check(cls, doc):
         is_xbrl = doc.get('x_info_data', {}).get('is_xbrl') == 'true'
         x_attachments = doc.get('x_attachments', [])
+        convert_status = doc.get('x_status_list', {}).get('status_convert', {}).get('status')
        xhtml_count = sum(1 for att in x_attachments if att['store_path'].lower().endswith('.xhtml'))
 
-        if is_xbrl or xhtml_count > 0:
+        if is_xbrl or xhtml_count > 0 and convert_status != 'convert_done':
             template = cls.create_xbrl_template()
             template['_id'] = doc['_id']
             template['source_type'] = doc.get('x_report_source', {}).get('source_type', '')
```
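One reading note on the new condition: in Python, `and` binds tighter than `or`, so the guard parses as `is_xbrl or (xhtml_count > 0 and convert_status != 'convert_done')`. The `convert_status` check therefore only gates the xhtml branch; an `is_xbrl` document is routed to the XBRL template regardless. A self-contained check of that grouping:

```python
def routes_to_xbrl_template(is_xbrl: bool, xhtml_count: int, convert_status: str) -> bool:
    # Same expression as the hunk; the grouping comes from operator precedence.
    return is_xbrl or xhtml_count > 0 and convert_status != 'convert_done'

assert routes_to_xbrl_template(True, 0, 'convert_done')       # is_xbrl always wins
assert routes_to_xbrl_template(False, 1, 'pending')           # unconverted xhtml attachment
assert not routes_to_xbrl_template(False, 1, 'convert_done')  # converted xhtml now passes through
```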
```diff
@@ -123,7 +124,7 @@ class DocumentProcessor:
             reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
         except ValueError:
             reported_date = datetime.datetime(1970, 1, 1)
-        return "extract" if reported_date < datetime.datetime(
+        return "extract" if reported_date < datetime.datetime(2023, 1, 1) else target_stage
 
     @classmethod
     async def create_record(cls, doc, start_stage, important_level):
```
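The removed line's cutoff date is truncated in this view; the new line pins it at 2023-01-01. A runnable sketch of the resulting behavior, wrapping the `try/except` from the hunk in a hypothetical `cap_target_stage` helper:

```python
import datetime

def cap_target_stage(date_str: str, target_stage: str) -> str:
    try:
        reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    except ValueError:
        reported_date = datetime.datetime(1970, 1, 1)  # unparseable dates count as old
    return "extract" if reported_date < datetime.datetime(2023, 1, 1) else target_stage

assert cap_target_stage("2022-06-30", "embedding") == "extract"    # pre-cutoff: capped
assert cap_target_stage("2024-01-15", "embedding") == "embedding"  # post-cutoff: unchanged
assert cap_target_stage("not-a-date", "embedding") == "extract"    # falls back to 1970
```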
```diff
@@ -177,7 +178,7 @@ class DocumentProcessor:
 
         is_xbrl, xbrl_data = cls.xbrl_type_check(doc)
         if is_xbrl:
-            return cls.create_result_info("xbrl", "XBRL format cannot be processed.", xbrl_data)
+            return cls.create_result_info("xbrl", "XBRL or Xhtml format cannot be processed.", xbrl_data)
 
         start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc, custom_process_step)
 
```
```diff
@@ -185,7 +186,7 @@ class DocumentProcessor:
         if target_stage == 'embedding' and not custom_process_step:
             target_stage = cls.update_target_stage_by_report_type(doc, target_stage)
             target_stage = cls.update_target_stage_by_reported_at(doc, target_stage)
-        #
+        # Special case: only embedding is needed, but the conditions above cap this data at extraction, leaving its status abnormal
         if start_stage == 'embedding' and target_stage == 'extract':
             start_stage = 'success'
             target_stage = 'success'
```
```diff
@@ -193,11 +194,11 @@ class DocumentProcessor:
         if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
             return cls.create_result_info("step_error",
                                           "Invalid process sequence: 'start_stage' occurs before 'target_stage'.",
-
+                                          report_id)
 
         file_name_check_status, record = await cls.create_record(doc, start_stage, important_level)
         if not file_name_check_status:
-            return cls.create_result_info("
+            return cls.create_result_info("error", "Document file name too lang.", report_id)
 
         return cls.create_result_info("file_flow", "Success", [start_stage, target_stage, x_spider_name, record])
 
```
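For readers tracing the `step_error` branch: stage ordering is defined purely by position in `DATA_PROCESS_STEPS`, so the check reduces to two `list.index` lookups. A minimal sketch:

```python
DATA_PROCESS_STEPS = ['convert', 'extract', 'embedding', 'success']

def is_valid_sequence(start_stage: str, target_stage: str) -> bool:
    # A target that sorts before the start is the "step_error" case above.
    return DATA_PROCESS_STEPS.index(target_stage) >= DATA_PROCESS_STEPS.index(start_stage)

assert is_valid_sequence('convert', 'embedding')
assert not is_valid_sequence('embedding', 'extract')  # the pair an earlier hunk remaps to 'success'/'success'
```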
{orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_flow_entry_process.py

```diff
@@ -1,7 +1,7 @@
 import os
 from collections import Counter
 from datetime import datetime
-from typing import
+from typing import Optional
 import logging
 import pymongo
 import pytz
```
```diff
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
 
 class FilingOfficialProcessor:
 
-    def __init__(self, mongo_uri=None, postgres_uri=None, aws_access_key_id=None, aws_secret_access_key=None, pi2_postgres_uri=None, pi2_database_name=None):
+    def __init__(self, mongo_uri=None, postgres_uri=None, aws_access_key_id=None, aws_secret_access_key=None, pi2_postgres_uri=None, pi2_database_name=None, databases_fileflow=None):
         mongo_uri = os.environ.get('MONGO_URI_MAIN_USER_APP') if not mongo_uri else mongo_uri
         if not mongo_uri:
             raise KeyError('mongo_uri not set.')
```
```diff
@@ -29,23 +29,15 @@ class FilingOfficialProcessor:
 
         self.mongo_client = pymongo.MongoClient(mongo_uri)
         self.data_xbrl_convert_collection = self.mongo_client['filing_reports']['data_xbrl_convert']
-
-        self.annotation_reports_view_rows_collection = self.mongo_client['filing_reports'][
-            'annotation_reports_view_rows']
-        self.source_map = {
-            'filing_data': (self.filing_data_collection, 'filing_data'),
-            'G7_demo': (self.filing_data_collection, 'G7_demo'),
-            'reports_view': [
-                (self.filing_data_collection, 'filing_data')
-            ]
-        }
+
         postgres_uri = os.environ.get('PG_URI_AIRFLOW12_USER_NEWSFEEDSITE') if not postgres_uri else postgres_uri
         if not postgres_uri:
             raise KeyError('postgres_uri not set.')
-
+        databases_fileflow = databases_fileflow or "process_net"
+        self.file_handler = FileFlowHandleV2(postgres_uri=postgres_uri, database_name=databases_fileflow)
         self.data_processor = DocumentProcessor()
         self.max_batch_size = 10000
-        self.all_stat_count = {'all': 0, 'skip': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
+        self.all_stat_count = {'all': 0, 'skip': 0, 'doc_error': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
 
         self.s3_client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
         self.matcher = OrbitTypeMatcher(self.s3_client)
```
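A hypothetical construction showing the new `databases_fileflow` argument, which defaults to `"process_net"` and is forwarded to `FileFlowHandleV2` as `database_name`. The URIs and keys below are placeholders, and the import path is assumed from the file's location in the package:

```python
from orbitkit.airflow_handler.file_flow_entry_process import FilingOfficialProcessor

processor = FilingOfficialProcessor(
    mongo_uri="mongodb://user:pass@mongo.example.com:27017",    # placeholder
    postgres_uri="postgresql://user:pass@pg.example.com:5432",  # placeholder
    aws_access_key_id="<key-id>",
    aws_secret_access_key="<secret>",
    databases_fileflow="process_net",  # new in 0.8.49; also the fallback default
)
```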
```diff
@@ -65,9 +57,19 @@ class FilingOfficialProcessor:
             autoload_with=self.postgres_engine, schema='security_master'
         )
 
+        self.postgres_engine2 = create_engine(f"{postgres_uri}/{databases_fileflow}",
+                                              connect_args={"sslmode": "require"})
+        self.postgres_session2 = sessionmaker(bind=self.postgres_engine2)
+        self.Session2 = scoped_session(self.postgres_session2)
+
+        self.op_meta = Table(
+            'op_meta', self.postgres_metadata,
+            autoload_with=self.postgres_engine2, schema='public'
+        )
+
     @contextmanager
-    def session_scope(self):
-        session = self.Session()
+    def session_scope(self, use_session=None):
+        session = self.Session() if not use_session else use_session
         try:
             yield session
             session.commit()
```
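A self-contained sketch of the pattern `session_scope` now supports: drawing from the default scoped session unless the caller passes another one, as `op_meat_deduplicate_docs` later does with `self.Session2`. The error-handling middle of the context manager is not shown in this hunk, so the `except` branch below is an illustrative assumption; note the `finally` still removes only the default registry, mirroring the diff:

```python
from contextlib import contextmanager
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker, scoped_session

engine = create_engine("sqlite:///:memory:")          # placeholder engine
Session = scoped_session(sessionmaker(bind=engine))   # stands in for self.Session
Session2 = scoped_session(sessionmaker(bind=engine))  # stands in for self.Session2

@contextmanager
def session_scope(use_session=None):
    session = Session() if not use_session else use_session
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()  # assumed: this branch is outside the hunk
        raise
    finally:
        Session.remove()  # as in the diff: always removes the default registry

with session_scope(use_session=Session2) as session:
    session.execute(text("SELECT 1"))  # runs on the second session
```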
```diff
@@ -77,7 +79,7 @@ class FilingOfficialProcessor:
         finally:
             self.Session.remove()
 
-    def create_spider_name_source_type_map(self,
+    def create_spider_name_source_type_map(self, collection, label):
 
         def find_duplicates(keys):
             return [k for k, v in Counter(keys).items() if v > 1]
```
```diff
@@ -85,9 +87,8 @@ class FilingOfficialProcessor:
         map_dict = {}
         pipeline = [{'$group': {'_id': "$x_spider_name"}}]
 
-        for
-
-            map_dict[document['_id']] = label
+        for document in collection.aggregate(pipeline):
+            map_dict[document['_id']] = label
 
         all_keys = list(map_dict.keys())
         duplicates = find_duplicates(all_keys)
```
```diff
@@ -174,10 +175,39 @@ class FilingOfficialProcessor:
         self.all_stat_count['file_flow'] += len(records)
         logger.info(f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: {status}, message: {message}")
 
+    def op_meat_deduplicate_docs(self, docs, buffer_size=1000):
+        buffer = []
+
+        for doc in docs:
+            buffer.append(doc)
+
+            if len(buffer) >= buffer_size:
+                doc_ids = [d['_id'] for d in buffer]
+                with self.session_scope(use_session=self.Session2) as session:
+                    existing_ids = session.query(self.op_meta.c.id).filter(self.op_meta.c.id.in_(doc_ids)).all()
+                    existing_ids = {i[0] for i in existing_ids}
+                    for buffered_doc in buffer:
+                        self.all_stat_count['all'] += 1
+                        if buffered_doc['_id'] not in existing_ids:
+                            yield buffered_doc
+
+                buffer.clear()
+
+        if buffer:
+            doc_ids = [d['_id'] for d in buffer]
+            with self.session_scope(use_session=self.Session2) as session:
+                existing_ids = session.query(self.op_meta.c.id).filter(self.op_meta.c.id.in_(doc_ids)).all()
+                existing_ids = {i[0] for i in existing_ids}
+                for buffered_doc in buffer:
+                    self.all_stat_count['all'] += 1
+                    if buffered_doc['_id'] not in existing_ids:
+                        yield buffered_doc
+
+            buffer.clear()
 
-    async def process_task_entry(self, source:
+    async def process_task_entry(self, source: str,
                                  query: dict, tags: list[str], priority: str,
-                                 is_important: bool = False, custom_step: Optional[list[str]] = None, important_level = None):
+                                 is_important: bool = False, custom_step: Optional[list[str]] = None, important_level = None, db_name: str = None):
 
         if not important_level or not isinstance(important_level, int):
             important_level = 0
```
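The shape of `op_meat_deduplicate_docs` above is a buffered existence-check generator: collect up to `buffer_size` docs, look up their ids against the already-processed `op_meta` table in one query, and yield only unseen docs. A standalone sketch of the same pattern, where `fetch_existing_ids` stands in for the SQL lookup:

```python
def deduplicate_docs(docs, fetch_existing_ids, buffer_size=1000):
    """Yield docs whose _id is not reported as existing, checking in batches."""
    buffer = []

    def flush(buf):
        # One batched lookup per buffer, then filter in memory.
        existing = set(fetch_existing_ids([d['_id'] for d in buf]))
        for doc in buf:
            if doc['_id'] not in existing:
                yield doc

    for doc in docs:
        buffer.append(doc)
        if len(buffer) >= buffer_size:
            yield from flush(buffer)
            buffer.clear()

    if buffer:  # tail batch
        yield from flush(buffer)

# Example: ids 2 and 3 are already processed, so only 1 and 4 come through.
docs = [{'_id': i} for i in range(1, 5)]
already_processed = {2, 3}
survivors = list(deduplicate_docs(docs, lambda ids: [i for i in ids if i in already_processed], buffer_size=2))
assert [d['_id'] for d in survivors] == [1, 4]
```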
```diff
@@ -195,43 +225,24 @@ class FilingOfficialProcessor:
             if step not in allowed_steps:
                 raise ValueError(f"Invalid step '{step}'. Allowed steps are: {allowed_steps}")
 
-        if
-
-        else:
-            collections = [self.source_map[source]]
-
-        spider_name_source_type = self.create_spider_name_source_type_map(collections)
+        collection = self.mongo_client[db_name if db_name else "filing_reports"][source]
+        spider_name_source_type = self.create_spider_name_source_type_map(collection, source)
 
         process_data = []
         perm_id_set = set()
-
-
-
-
-        for
-
-
-
-
-            process_data.append(result_record)
-            if len(process_data) >= self.max_batch_size:
-                file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
-                    process_data)
-                file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
-                self.all_stat_count['skip'] += len(doc_error_list)
-                self.all_stat_count['step_error'] += len(except_id_list)
-                self.all_stat_count['xbrl'] += len(xbrl_data)
-                self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
-                self.send_xbrl_data_to_mongo(xbrl_data)
-                self.update_doc_status_to_convert(collection, doc_error_list)
-                process_data.clear()
-                perm_id_set.clear()
-
-        if process_data:
+        logger.info(f"load {source} data.")
+        docs = collection.find(query).batch_size(1000)
+        duplicate_docs = self.op_meat_deduplicate_docs(docs, buffer_size=self.max_batch_size) if not is_important else docs
+        for doc in duplicate_docs:
+            for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
+                perm_id_set.add(orbit_entity_id)
+            result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step, important_level=important_level)
+            process_data.append(result_record)
+            if len(process_data) >= self.max_batch_size:
                 file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
                     process_data)
                 file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
-                self.all_stat_count['skip'] += len(doc_error_list)
+                self.all_stat_count['doc_error'] += len(doc_error_list)
                 self.all_stat_count['step_error'] += len(except_id_list)
                 self.all_stat_count['xbrl'] += len(xbrl_data)
                 self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
```
```diff
@@ -240,5 +251,18 @@ class FilingOfficialProcessor:
                 process_data.clear()
                 perm_id_set.clear()
 
+        if process_data:
+            file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
+                process_data)
+            file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
+            self.all_stat_count['doc_error'] += len(doc_error_list)
+            self.all_stat_count['step_error'] += len(except_id_list)
+            self.all_stat_count['xbrl'] += len(xbrl_data)
+            self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
+            self.send_xbrl_data_to_mongo(xbrl_data)
+            self.update_doc_status_to_convert(collection, doc_error_list)
+            process_data.clear()
+            perm_id_set.clear()
+
         logger.info(f"finish processing {self.all_stat_count}. \n")
-        self.all_stat_count = {'all': 0, 'skip': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
+        self.all_stat_count = {'all': 0, 'skip': 0, 'doc_error': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
```
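Reusing the `processor` from the constructor sketch above, a hypothetical invocation of the reworked entry point; the query, tags, and priority values are placeholders, not documented options:

```python
import asyncio

async def main():
    await processor.process_task_entry(
        source="filing_data",      # now used directly as the MongoDB collection name
        query={},                  # placeholder filter passed to collection.find()
        tags=["daily"],
        priority="normal",
        is_important=False,        # False routes docs through op_meat_deduplicate_docs
        custom_step=None,
        important_level=0,
        db_name="filing_reports",  # new in 0.8.49; also the fallback when omitted
    )

asyncio.run(main())
```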
{orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_handler_v2.py

```diff
@@ -44,7 +44,7 @@ class FileFlowHandleV2:
             autoload_with=self.postgres_engine, schema='public'
         )
 
-        self.not_allow_file_type_list = not_allow_file_type_list or [
+        self.not_allow_file_type_list = not_allow_file_type_list or []
 
     @contextmanager
     def session_scope(self):
```
```diff
@@ -94,20 +94,23 @@ class FileFlowHandleV2:
         result = session.execute(stmt)
         return [row[0] for row in result.fetchall()]
 
-    def _check_records(self, records: List[Dict[str, Any]], clean_exist_data: bool) -> Tuple[bool, List[str], str]:
+    def _check_records(self, records: List[Dict[str, Any]], clean_exist_data: bool) -> Tuple[bool, List[str], str, List[str]]:
         ids = []
+        invalidate_ids = []
         record_count = 0
         for record in records:
             record_count += 1
             is_valid, msg = self._validate_record(record)
             if not is_valid:
-
+                invalidate_ids.append(record.get('id', 'unknown'))
+                logger.error(f"Validation failed for record {record.get('id', 'unknown')}: {msg}")
+                continue
             ids.append(record["id"])
 
         existing_ids = self._get_existing_ids(ids)
         if not clean_exist_data and len(existing_ids) == record_count:
-            return False, existing_ids, "No new data has been inserted."
-        return True, existing_ids, f"Validation complete. total: {len(records)}. {len(existing_ids)} records already exist."
+            return False, existing_ids, "No new data has been inserted.", invalidate_ids
+        return True, existing_ids, f"Validation complete. total: {len(records)}. {len(existing_ids)} records already exist.", invalidate_ids
 
     def _build_insert_data(
         self, record: Dict[str, Any], params: Dict[str, Any]
```
```diff
@@ -324,7 +327,7 @@ class FileFlowHandleV2:
         elif not isinstance(records, list):
             raise ValueError("records must be a dict or list of dicts.")
 
-        is_valid, existing_ids, msg = self._check_records(records, clean_exist_data)
+        is_valid, existing_ids, msg, invalidate_ids = self._check_records(records, clean_exist_data)
         if not is_valid:
             return False, existing_ids, msg
         logger.info(msg)
```
```diff
@@ -334,6 +337,8 @@ class FileFlowHandleV2:
         exist_data_ids = set()
         count = 0
         for record in records:
+            if record['id'] in invalidate_ids:
+                continue
             count += 1
             if record['id'] in existing_ids:
                 if clean_exist_data:
```
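Taken together, the last three hunks change the failure mode: a record that fails validation no longer stops the batch; its id is collected by `_check_records` and the insert loop skips it. A toy sketch of that flow, with a stand-in validator (the real `_validate_record` rules are not shown in this diff):

```python
def check_records(records):
    # Stand-in for _check_records: here a record is valid iff it has a store_path.
    ids, invalidate_ids = [], []
    for record in records:
        if "store_path" not in record:
            invalidate_ids.append(record.get("id", "unknown"))
            continue
        ids.append(record["id"])
    return ids, invalidate_ids

records = [
    {"id": "a", "store_path": "s3://bucket/a.pdf"},
    {"id": "b"},                                     # fails validation
    {"id": "c", "store_path": "s3://bucket/c.pdf"},
]
ids, invalid = check_records(records)
# The insert loop now skips invalid ids instead of failing the whole batch.
to_insert = [r for r in records if r["id"] not in invalid]
assert [r["id"] for r in to_insert] == ["a", "c"]
```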
{orbitkit-0.8.48 → orbitkit-0.8.49/orbitkit.egg-info}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: orbitkit
-Version: 0.8.48
+Version: 0.8.49
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao
@@ -37,19 +37,6 @@ Requires-Dist: prettytable>=3.17.0
 Requires-Dist: pytz>=2025.2
 Requires-Dist: Deprecated
 Requires-Dist: func_timeout
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
-Dynamic: license-file
-Dynamic: maintainer
-Dynamic: maintainer-email
-Dynamic: platform
-Dynamic: requires-dist
-Dynamic: summary
 
 # orbitkit
 
```
orbitkit-0.8.48/orbitkit/VERSION DELETED

```diff
@@ -1 +0,0 @@
-0.8.48
```
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|