orbitkit 0.8.37 (tar.gz) → 0.8.39 (tar.gz)

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (78)
  1. {orbitkit-0.8.37/orbitkit.egg-info → orbitkit-0.8.39}/PKG-INFO +1 -1
  2. orbitkit-0.8.39/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/airflow_handler/data_preprocessing.py +34 -5
  4. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/airflow_handler/file_flow_entry_process.py +76 -7
  5. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/airflow_handler/file_handler_v2.py +1 -0
  6. {orbitkit-0.8.37 → orbitkit-0.8.39/orbitkit.egg-info}/PKG-INFO +1 -1
  7. orbitkit-0.8.37/orbitkit/VERSION +0 -1
  8. {orbitkit-0.8.37 → orbitkit-0.8.39}/LICENSE +0 -0
  9. {orbitkit-0.8.37 → orbitkit-0.8.39}/MANIFEST.in +0 -0
  10. {orbitkit-0.8.37 → orbitkit-0.8.39}/README.md +0 -0
  11. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/__init__.py +0 -0
  12. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/airflow_handler/__init__.py +0 -0
  13. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  14. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/audio_transcoder/__init__.py +0 -0
  16. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  17. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/constant/__init__.py +0 -0
  18. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/constant/report_schema.py +0 -0
  19. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/id_srv/__init__.py +0 -0
  20. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/id_srv/id_gen.py +0 -0
  21. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/id_srv/id_perm_like.py +0 -0
  22. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/lark_send/__init__.py +0 -0
  23. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/lark_send/lark.py +0 -0
  24. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/llm_tools/__init__.py +0 -0
  25. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  26. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/orbit_type/__init__.py +0 -0
  27. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  28. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  29. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/orbit_type/tools.py +0 -0
  30. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_embedding/__init__.py +0 -0
  31. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  32. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  33. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/__init__.py +0 -0
  34. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  35. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/exceptions.py +0 -0
  36. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  37. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  38. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  39. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  40. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  41. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  42. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  43. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  44. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  45. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  46. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/base.py +0 -0
  47. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  48. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/core.py +0 -0
  49. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  50. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  51. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  52. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_writer/__init__.py +0 -0
  53. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  54. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/__init__.py +0 -0
  55. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/cache_asset_downloader.py +0 -0
  56. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/common.py +0 -0
  57. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/customize_regix_manager.py +0 -0
  58. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/secret_manager.py +0 -0
  59. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_aliyun.py +0 -0
  60. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  61. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_aws.py +0 -0
  62. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  63. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_date.py +0 -0
  64. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_html.py +0 -0
  65. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_kafka.py +0 -0
  66. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_md5.py +0 -0
  67. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_selenium.py +0 -0
  68. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_simple_timer.py +0 -0
  69. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_str.py +0 -0
  70. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_type_mapping.py +0 -0
  71. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit/util/util_url.py +0 -0
  72. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit.egg-info/SOURCES.txt +0 -0
  73. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit.egg-info/dependency_links.txt +0 -0
  74. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit.egg-info/not-zip-safe +0 -0
  75. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit.egg-info/requires.txt +0 -0
  76. {orbitkit-0.8.37 → orbitkit-0.8.39}/orbitkit.egg-info/top_level.txt +0 -0
  77. {orbitkit-0.8.37 → orbitkit-0.8.39}/setup.cfg +0 -0
  78. {orbitkit-0.8.37 → orbitkit-0.8.39}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: orbitkit
3
- Version: 0.8.37
3
+ Version: 0.8.39
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.39
@@ -1,3 +1,4 @@
1
+ import os
1
2
  import datetime
2
3
  from collections import defaultdict
3
4
 
@@ -102,19 +103,43 @@ class DocumentProcessor:
102
103
  return "extract" if report_type_ids == ['19999'] else target_stage
103
104
 
104
105
  @staticmethod
105
- def create_record(doc, start_stage):
106
+ def update_target_stage_for_reported_at(doc, target_stage):
107
+ date_str = doc.get('x_reported_at_utc_date', '1970-01-01')
108
+ try:
109
+ reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
110
+ except ValueError:
111
+ reported_date = datetime.datetime(1970, 1, 1)
112
+ return "extract" if reported_date < datetime.datetime(2020, 1, 1) else target_stage
113
+
114
+ @classmethod
115
+ def create_record(cls, doc, start_stage):
106
116
  attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
107
117
  s3_path_info = []
118
+ add_extends = {}
108
119
  for att in attachments:
109
120
  if len(att['file_name']) > 2000 or len(att['file_name'].encode('utf-8')) > 2000:
110
121
  return False, None
122
+
123
+ if start_stage == 'convert' and not add_extends:
124
+ _, ext = os.path.splitext(att['store_path'])
125
+ if ext in cls.AUDIO_SUFFIXES or ext in cls.VIDEO_SUFFIXES:
126
+ add_extends = {
127
+ "title": doc['x_orbit_data']['report_title'],
128
+ "published": doc['x_reported_at_utc_date'],
129
+ "tickers": [],
130
+ "perm_id_list": doc['x_orbit_data']['perm_id_list'],
131
+ "report_type_id_list_str": doc['x_orbit_data']['report_type_id_list']
132
+ }
133
+
111
134
  s3_path_info.append({
112
135
  'store_path': f"s3://{att['bucket']}/{att['store_path']}" if start_stage == 'convert' else att[
113
136
  'store_path'],
114
137
  'file_name': att['file_name']
115
138
  })
116
-
117
- return True, {'id': doc['_id'], 's3_path_info': s3_path_info}
139
+ result_dict = {'id': doc['_id'], 's3_path_info': s3_path_info}
140
+ if add_extends:
141
+ result_dict['extends'] = add_extends
142
+ return True, result_dict
118
143
 
119
144
  @staticmethod
120
145
  def create_result_info(process_type, message, result_data):
@@ -125,7 +150,7 @@ class DocumentProcessor:
125
150
  }
126
151
 
127
152
  @classmethod
128
- def process(cls, doc):
153
+ def process(cls, doc, check_doc):
129
154
  report_id = doc['_id']
130
155
  # 筛选文件
131
156
  doc = cls.stock_us_filter_by_is_primary(doc)
@@ -140,7 +165,11 @@ class DocumentProcessor:
140
165
  return cls.create_result_info("xbrl", "XBRL format cannot be processed.", xbrl_data)
141
166
 
142
167
  start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc)
143
- target_stage = cls.update_target_stage_for_report_type(doc, target_stage)
168
+
169
+ # 特殊条件下的数据不做embedding ('19999'类型和报告日期小于2020-01-01)
170
+ if target_stage == 'embedding' and check_doc:
171
+ target_stage = cls.update_target_stage_for_report_type(doc, target_stage)
172
+ target_stage = cls.update_target_stage_for_reported_at(doc, target_stage)
144
173
 
145
174
  if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
146
175
  return cls.create_result_info("step_error",
@@ -5,31 +5,40 @@ from typing import Literal
5
5
  import logging
6
6
  import pymongo
7
7
  import pytz
8
+ import boto3
9
+ from sqlalchemy import create_engine, Table, MetaData, select
10
+ from sqlalchemy.orm import sessionmaker, scoped_session
11
+ from contextlib import contextmanager
12
+
8
13
  from orbitkit.airflow_handler.file_handler_v2 import FileFlowHandleV2
9
14
  from orbitkit.airflow_handler.data_preprocessing import DocumentProcessor
15
+ from orbitkit.orbit_type import OrbitTypeMatcher
10
16
 
11
17
  logger = logging.getLogger(__name__)
12
18
 
13
19
 
14
20
  class FilingOfficialProcessor:
15
21
 
16
- def __init__(self, mongo_uri=None, postgres_uri=None):
22
+ def __init__(self, mongo_uri=None, postgres_uri=None, aws_access_key_id=None, aws_secret_access_key=None, pi2_postgres_uri=None, pi2_database_name=None):
17
23
  mongo_uri = os.environ.get('MONGO_URI_MAIN_USER_APP') if not mongo_uri else mongo_uri
18
24
  if not mongo_uri:
19
25
  raise KeyError('mongo_uri not set.')
20
26
 
27
+ if not aws_secret_access_key or not aws_access_key_id:
28
+ raise KeyError('aws_access_key_id and aws_secret_access_key not set.')
29
+
21
30
  self.mongo_client = pymongo.MongoClient(mongo_uri)
22
31
  self.data_xbrl_convert_collection = self.mongo_client['filing_reports']['data_xbrl_convert']
23
32
  self.filing_data_collection = self.mongo_client['filing_reports']['filing_data']
24
- self.official_data_collection = self.mongo_client['filing_reports']['official_data_relocation']
33
+ self.filing_reports_astock_test0822_collection = self.mongo_client['filing_reports']['filing_reports_astock_test0822']
25
34
  self.annotation_reports_view_rows_collection = self.mongo_client['filing_reports'][
26
35
  'annotation_reports_view_rows']
27
36
  self.source_map = {
28
37
  'filing_data': (self.filing_data_collection, 'filing_data'),
29
- 'official_data': (self.official_data_collection, 'official_data_relocation'),
38
+ 'filing_reports_astock_test0822': (self.filing_reports_astock_test0822_collection, 'filing_reports_astock_test0822'),
30
39
  'reports_view': [
31
40
  (self.filing_data_collection, 'filing_data'),
32
- (self.official_data_collection, 'official_data_relocation')
41
+ (self.filing_reports_astock_test0822_collection, 'filing_reports_astock_test0822')
33
42
  ]
34
43
  }
35
44
  self.only_low_important_set = {'internal_seekingalpha'}
@@ -41,6 +50,36 @@ class FilingOfficialProcessor:
41
50
  self.max_batch_size = 10000
42
51
  self.all_stat_count = {'all': 0, 'skip': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
43
52
 
53
+ self.s3_client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
54
+ self.matcher = OrbitTypeMatcher(self.s3_client)
55
+ self.report_type_id_name_map = {i["lv3_id"]: i["lv3_name"] for i in self.matcher.get_full_type_list()}
56
+
57
+ self.pi2_postgres_uri = pi2_postgres_uri or os.environ['PG_URI_CX45_USER_GLAUUIADMIN']
58
+ if not self.pi2_postgres_uri:
59
+ raise KeyError('pie_postgres_uri not set.')
60
+ self.databases = pi2_database_name or 'newsfeedsite'
61
+ self.postgres_engine = create_engine(f"{self.pi2_postgres_uri}/{self.databases}", connect_args={"sslmode": "require"})
62
+ self.postgres_session = sessionmaker(bind=self.postgres_engine)
63
+ self.Session = scoped_session(self.postgres_session)
64
+ self.postgres_metadata = MetaData()
65
+
66
+ self.pi2_table = Table(
67
+ 'primary_instrument_2_release', self.postgres_metadata,
68
+ autoload_with=self.postgres_engine, schema='security_master'
69
+ )
70
+
71
+ @contextmanager
72
+ def session_scope(self):
73
+ session = self.Session()
74
+ try:
75
+ yield session
76
+ session.commit()
77
+ except Exception:
78
+ session.rollback()
79
+ raise
80
+ finally:
81
+ self.Session.remove()
82
+
44
83
  def create_spider_name_source_type_map(self, collections):
45
84
 
46
85
  def find_duplicates(keys):
@@ -85,6 +124,29 @@ class FilingOfficialProcessor:
85
124
  }})
86
125
  logger.info(f'The document file type cannot be converted.')
87
126
 
127
+ def update_extends_fields(self, perm_id_list, file_flow_info):
128
+ stmt = select(self.pi2_table.c.orbit_entity_id, self.pi2_table.c.ticker).where(self.pi2_table.c.orbit_entity_id.in_(perm_id_list))
129
+ orbit_entity_id_ticker_map = {}
130
+ with self.session_scope() as session:
131
+ result = session.execute(stmt)
132
+ for row in result:
133
+ if row.orbit_entity_id not in orbit_entity_id_ticker_map:
134
+ orbit_entity_id_ticker_map[row.orbit_entity_id] = []
135
+
136
+ if row.ticker is not None:
137
+ orbit_entity_id_ticker_map[row.orbit_entity_id].append(row.ticker)
138
+ for step_info, records in file_flow_info.items():
139
+ for record in records:
140
+ if 'extends' in record and record.get('extends') is not None:
141
+ tickers = []
142
+ for i in record['extends']['perm_id_list']:
143
+ tickers.extend(orbit_entity_id_ticker_map.get(i, []))
144
+ record['extends']['tickers'] = tickers
145
+
146
+ record['extends']['report_type_id_list_str'] = [self.report_type_id_name_map.get(i) for i in record['extends']['report_type_id_list_str']]
147
+
148
+ return file_flow_info
149
+
88
150
  def send_task(self, file_flow_info, tags, is_important, priority, spider_name_source_type):
89
151
  for step_str, records in file_flow_info.items():
90
152
  steps = step_str.split('@__@')
@@ -116,9 +178,9 @@ class FilingOfficialProcessor:
116
178
  logger.info(f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: {status}, message: {message}")
117
179
 
118
180
 
119
- def process_task_entry(self, source: Literal["filing_data", "official_data", "reports_view"],
181
+ def process_task_entry(self, source: Literal["filing_data", "filing_reports_astock_test0822", "reports_view"],
120
182
  query: dict, tags: list[str], priority: str,
121
- is_important: bool = False):
183
+ is_important: bool = False, check_doc: bool = True):
122
184
 
123
185
  if source == 'reports_view':
124
186
  collections = self.source_map[source]
@@ -128,16 +190,20 @@ class FilingOfficialProcessor:
128
190
  spider_name_source_type = self.create_spider_name_source_type_map(collections)
129
191
 
130
192
  process_data = []
193
+ perm_id_set = set()
131
194
  for collection, label in collections:
132
195
  logger.info(f"load {label} data.")
133
196
  docs = collection.find(query).batch_size(1000)
134
197
 
135
198
  for doc in docs:
136
199
  self.all_stat_count['all'] += 1
137
- process_data.append(self.data_processor.process(doc))
200
+ for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
201
+ perm_id_set.add(orbit_entity_id)
202
+ process_data.append(self.data_processor.process(doc, check_doc))
138
203
  if len(process_data) >= self.max_batch_size:
139
204
  file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
140
205
  process_data)
206
+ file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
141
207
  self.all_stat_count['skip'] += len(doc_error_list)
142
208
  self.all_stat_count['step_error'] += len(except_id_list)
143
209
  self.all_stat_count['xbrl'] += len(xbrl_data)
@@ -145,10 +211,12 @@ class FilingOfficialProcessor:
145
211
  self.send_xbrl_data_to_mongo(xbrl_data)
146
212
  self.update_doc_status_to_convert(collection, doc_error_list)
147
213
  process_data.clear()
214
+ perm_id_set.clear()
148
215
 
149
216
  if process_data:
150
217
  file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
151
218
  process_data)
219
+ file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
152
220
  self.all_stat_count['skip'] += len(doc_error_list)
153
221
  self.all_stat_count['step_error'] += len(except_id_list)
154
222
  self.all_stat_count['xbrl'] += len(xbrl_data)
@@ -156,5 +224,6 @@ class FilingOfficialProcessor:
156
224
  self.send_xbrl_data_to_mongo(xbrl_data)
157
225
  self.update_doc_status_to_convert(collection, doc_error_list)
158
226
  process_data.clear()
227
+ perm_id_set.clear()
159
228
 
160
229
  logger.info(f"finish processing {self.all_stat_count}.")
@@ -132,6 +132,7 @@ class FileFlowHandleV2:
132
132
  'current_stage': params['current_stage'],
133
133
  'target_stage': params['target_stage'],
134
134
  'data_source': params['source_type'],
135
+ 'extends': record.get('extends', {}),
135
136
  'created_at': now,
136
137
  'updated_at': now,
137
138
  'tags': params['tags'],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: orbitkit
3
- Version: 0.8.37
3
+ Version: 0.8.39
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.37
File without changes
File without changes
File without changes
File without changes
File without changes