PyPI - orbitkit - Versions diffs - 0.8.37__tar.gz → 0.8.39__tar.gz - Mend

orbitkit 0.8.37.tar.gz → 0.8.39.tar.gz

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (78) hide show

{orbitkit-0.8.37/orbitkit.egg-info → orbitkit-0.8.39}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: orbitkit
-Version: 0.8.37
+Version: 0.8.39
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao

orbitkit-0.8.39/orbitkit/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.8.39

{orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/airflow_handler/data_preprocessing.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import os
 import datetime
 from collections import defaultdict
@@ -102,19 +103,43 @@ class DocumentProcessor:
         return "extract" if report_type_ids == ['19999'] else target_stage
     @staticmethod
-    def create_record(doc, start_stage):
+    def update_target_stage_for_reported_at(doc, target_stage):
+        date_str = doc.get('x_reported_at_utc_date', '1970-01-01')
+        try:
+            reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
+        except ValueError:
+            reported_date = datetime.datetime(1970, 1, 1)
+        return "extract" if reported_date < datetime.datetime(2020, 1, 1) else target_stage
+    @classmethod
+    def create_record(cls, doc, start_stage):
         attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
         s3_path_info = []
+        add_extends = {}
         for att in attachments:
             if len(att['file_name']) > 2000 or len(att['file_name'].encode('utf-8')) > 2000:
                 return False, None
+            if start_stage == 'convert' and not add_extends:
+                _, ext = os.path.splitext(att['store_path'])
+                if ext in cls.AUDIO_SUFFIXES or ext in cls.VIDEO_SUFFIXES:
+                    add_extends = {
+                        "title": doc['x_orbit_data']['report_title'],
+                        "published": doc['x_reported_at_utc_date'],
+                        "tickers": [],
+                        "perm_id_list": doc['x_orbit_data']['perm_id_list'],
+                        "report_type_id_list_str": doc['x_orbit_data']['report_type_id_list']
+                    }
             s3_path_info.append({
                 'store_path': f"s3://{att['bucket']}/{att['store_path']}" if start_stage == 'convert' else att[
                     'store_path'],
                 'file_name': att['file_name']
             })
-        return True, {'id': doc['_id'], 's3_path_info': s3_path_info}
+        result_dict = {'id': doc['_id'], 's3_path_info': s3_path_info}
+        if add_extends:
+            result_dict['extends'] = add_extends
+        return True, result_dict
     @staticmethod
     def create_result_info(process_type, message, result_data):
@@ -125,7 +150,7 @@ class DocumentProcessor:
         }
     @classmethod
-    def process(cls, doc):
+    def process(cls, doc, check_doc):
         report_id = doc['_id']
         # 筛选文件
         doc = cls.stock_us_filter_by_is_primary(doc)
@@ -140,7 +165,11 @@ class DocumentProcessor:
             return cls.create_result_info("xbrl", "XBRL format cannot be processed.", xbrl_data)
         start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc)
-        target_stage = cls.update_target_stage_for_report_type(doc, target_stage)
+        # 特殊条件下的数据不做embedding ('19999'类型和报告日期小于2020-01-01)
+        if target_stage == 'embedding' and check_doc:
+            target_stage = cls.update_target_stage_for_report_type(doc, target_stage)
+            target_stage = cls.update_target_stage_for_reported_at(doc, target_stage)
         if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
             return cls.create_result_info("step_error",

{orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/airflow_handler/file_flow_entry_process.py RENAMED Viewed

@@ -5,31 +5,40 @@ from typing import Literal
 import logging
 import pymongo
 import pytz
+import boto3
+from sqlalchemy import create_engine, Table, MetaData, select
+from sqlalchemy.orm import sessionmaker, scoped_session
+from contextlib import contextmanager
 from orbitkit.airflow_handler.file_handler_v2 import FileFlowHandleV2
 from orbitkit.airflow_handler.data_preprocessing import DocumentProcessor
+from orbitkit.orbit_type import OrbitTypeMatcher
 logger = logging.getLogger(__name__)
 class FilingOfficialProcessor:
-    def __init__(self, mongo_uri=None, postgres_uri=None):
+    def __init__(self, mongo_uri=None, postgres_uri=None, aws_access_key_id=None, aws_secret_access_key=None, pi2_postgres_uri=None, pi2_database_name=None):
         mongo_uri = os.environ.get('MONGO_URI_MAIN_USER_APP') if not mongo_uri else mongo_uri
         if not mongo_uri:
             raise KeyError('mongo_uri not set.')
+        if not aws_secret_access_key or not aws_access_key_id:
+            raise KeyError('aws_access_key_id and aws_secret_access_key not set.')
         self.mongo_client = pymongo.MongoClient(mongo_uri)
         self.data_xbrl_convert_collection = self.mongo_client['filing_reports']['data_xbrl_convert']
         self.filing_data_collection = self.mongo_client['filing_reports']['filing_data']
-        self.official_data_collection = self.mongo_client['filing_reports']['official_data_relocation']
+        self.filing_reports_astock_test0822_collection = self.mongo_client['filing_reports']['filing_reports_astock_test0822']
         self.annotation_reports_view_rows_collection = self.mongo_client['filing_reports'][
             'annotation_reports_view_rows']
         self.source_map = {
             'filing_data': (self.filing_data_collection, 'filing_data'),
-            'official_data': (self.official_data_collection, 'official_data_relocation'),
+            'filing_reports_astock_test0822': (self.filing_reports_astock_test0822_collection, 'filing_reports_astock_test0822'),
             'reports_view': [
                 (self.filing_data_collection, 'filing_data'),
-                (self.official_data_collection, 'official_data_relocation')
+                (self.filing_reports_astock_test0822_collection, 'filing_reports_astock_test0822')
             ]
         }
         self.only_low_important_set = {'internal_seekingalpha'}
@@ -41,6 +50,36 @@ class FilingOfficialProcessor:
         self.max_batch_size = 10000
         self.all_stat_count = {'all': 0, 'skip': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
+        self.s3_client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
+        self.matcher = OrbitTypeMatcher(self.s3_client)
+        self.report_type_id_name_map = {i["lv3_id"]: i["lv3_name"] for i in self.matcher.get_full_type_list()}
+        self.pi2_postgres_uri = pi2_postgres_uri or os.environ['PG_URI_CX45_USER_GLAUUIADMIN']
+        if not self.pi2_postgres_uri:
+            raise KeyError('pie_postgres_uri not set.')
+        self.databases = pi2_database_name or 'newsfeedsite'
+        self.postgres_engine = create_engine(f"{self.pi2_postgres_uri}/{self.databases}", connect_args={"sslmode": "require"})
+        self.postgres_session = sessionmaker(bind=self.postgres_engine)
+        self.Session = scoped_session(self.postgres_session)
+        self.postgres_metadata = MetaData()
+        self.pi2_table = Table(
+            'primary_instrument_2_release', self.postgres_metadata,
+            autoload_with=self.postgres_engine, schema='security_master'
+        )
+    @contextmanager
+    def session_scope(self):
+        session = self.Session()
+        try:
+            yield session
+            session.commit()
+        except Exception:
+            session.rollback()
+            raise
+        finally:
+            self.Session.remove()
     def create_spider_name_source_type_map(self, collections):
         def find_duplicates(keys):
@@ -85,6 +124,29 @@ class FilingOfficialProcessor:
         }})
         logger.info(f'The document file type cannot be converted.')
+    def update_extends_fields(self, perm_id_list, file_flow_info):
+        stmt = select(self.pi2_table.c.orbit_entity_id, self.pi2_table.c.ticker).where(self.pi2_table.c.orbit_entity_id.in_(perm_id_list))
+        orbit_entity_id_ticker_map = {}
+        with self.session_scope() as session:
+            result = session.execute(stmt)
+            for row in result:
+                if row.orbit_entity_id not in orbit_entity_id_ticker_map:
+                    orbit_entity_id_ticker_map[row.orbit_entity_id] = []
+                if row.ticker is not None:
+                    orbit_entity_id_ticker_map[row.orbit_entity_id].append(row.ticker)
+        for step_info, records in file_flow_info.items():
+            for record in records:
+                if 'extends' in record and record.get('extends') is not None:
+                    tickers = []
+                    for i in record['extends']['perm_id_list']:
+                        tickers.extend(orbit_entity_id_ticker_map.get(i, []))
+                    record['extends']['tickers'] = tickers
+                    record['extends']['report_type_id_list_str'] = [self.report_type_id_name_map.get(i) for i in record['extends']['report_type_id_list_str']]
+        return file_flow_info
     def send_task(self, file_flow_info, tags, is_important, priority, spider_name_source_type):
         for step_str, records in file_flow_info.items():
             steps = step_str.split('@__@')
@@ -116,9 +178,9 @@ class FilingOfficialProcessor:
             logger.info(f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: {status}, message: {message}")
-    def process_task_entry(self, source: Literal["filing_data", "official_data", "reports_view"],
+    def process_task_entry(self, source: Literal["filing_data", "filing_reports_astock_test0822", "reports_view"],
                            query: dict, tags: list[str], priority: str,
-                           is_important: bool = False):
+                           is_important: bool = False, check_doc: bool = True):
         if source == 'reports_view':
             collections = self.source_map[source]
@@ -128,16 +190,20 @@ class FilingOfficialProcessor:
         spider_name_source_type = self.create_spider_name_source_type_map(collections)
         process_data = []
+        perm_id_set = set()
         for collection, label in collections:
             logger.info(f"load {label} data.")
             docs = collection.find(query).batch_size(1000)
             for doc in docs:
                 self.all_stat_count['all'] += 1
-                process_data.append(self.data_processor.process(doc))
+                for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
+                    perm_id_set.add(orbit_entity_id)
+                process_data.append(self.data_processor.process(doc, check_doc))
                 if len(process_data) >= self.max_batch_size:
                     file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
                         process_data)
+                    file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
                     self.all_stat_count['skip'] += len(doc_error_list)
                     self.all_stat_count['step_error'] += len(except_id_list)
                     self.all_stat_count['xbrl'] += len(xbrl_data)
@@ -145,10 +211,12 @@ class FilingOfficialProcessor:
                     self.send_xbrl_data_to_mongo(xbrl_data)
                     self.update_doc_status_to_convert(collection, doc_error_list)
                     process_data.clear()
+                    perm_id_set.clear()
             if process_data:
                 file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
                     process_data)
+                file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
                 self.all_stat_count['skip'] += len(doc_error_list)
                 self.all_stat_count['step_error'] += len(except_id_list)
                 self.all_stat_count['xbrl'] += len(xbrl_data)
@@ -156,5 +224,6 @@ class FilingOfficialProcessor:
                 self.send_xbrl_data_to_mongo(xbrl_data)
                 self.update_doc_status_to_convert(collection, doc_error_list)
                 process_data.clear()
+                perm_id_set.clear()
         logger.info(f"finish processing {self.all_stat_count}.")

{orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/airflow_handler/file_handler_v2.py RENAMED Viewed

@@ -132,6 +132,7 @@ class FileFlowHandleV2:
             'current_stage': params['current_stage'],
             'target_stage': params['target_stage'],
             'data_source': params['source_type'],
+            'extends': record.get('extends', {}),
             'created_at': now,
             'updated_at': now,
             'tags': params['tags'],

{orbitkit-0.8.37 → orbitkit-0.8.39/orbitkit.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: orbitkit
-Version: 0.8.37
+Version: 0.8.39
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao