orbitkit 0.8.19__tar.gz → 0.8.55__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. {orbitkit-0.8.19/orbitkit.egg-info → orbitkit-0.8.55}/PKG-INFO +16 -6
  2. orbitkit-0.8.55/orbitkit/VERSION +1 -0
  3. orbitkit-0.8.55/orbitkit/airflow_handler/data_preprocessing.py +234 -0
  4. orbitkit-0.8.55/orbitkit/airflow_handler/file_flow_entry_process.py +274 -0
  5. orbitkit-0.8.55/orbitkit/airflow_handler/file_flow_exit_process.py +157 -0
  6. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/airflow_handler/file_handler.py +1 -20
  7. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/airflow_handler/file_handler_v2.py +69 -53
  8. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/audio_transcoder/netmind_extract_v1.py +5 -4
  9. orbitkit-0.8.55/orbitkit/orbit_type/__init__.py +1 -0
  10. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/orbit_type/orbit_type_simple.py +277 -1
  11. orbitkit-0.8.55/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +259 -0
  12. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +23 -6
  13. orbitkit-0.8.55/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +446 -0
  14. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/__init__.py +7 -0
  15. orbitkit-0.8.55/orbitkit/util/cache_asset_downloader.py +132 -0
  16. orbitkit-0.8.55/orbitkit/util/universal_extractor.py +525 -0
  17. orbitkit-0.8.55/orbitkit/util/util_aws_s3_wrapper.py +378 -0
  18. {orbitkit-0.8.19 → orbitkit-0.8.55/orbitkit.egg-info}/PKG-INFO +16 -6
  19. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit.egg-info/SOURCES.txt +7 -0
  20. orbitkit-0.8.55/orbitkit.egg-info/requires.txt +8 -0
  21. {orbitkit-0.8.19 → orbitkit-0.8.55}/setup.py +13 -4
  22. orbitkit-0.8.19/orbitkit/VERSION +0 -1
  23. orbitkit-0.8.19/orbitkit/orbit_type/__init__.py +0 -1
  24. orbitkit-0.8.19/orbitkit/util/util_aws_s3_wrapper.py +0 -154
  25. orbitkit-0.8.19/orbitkit.egg-info/requires.txt +0 -6
  26. {orbitkit-0.8.19 → orbitkit-0.8.55}/LICENSE +0 -0
  27. {orbitkit-0.8.19 → orbitkit-0.8.55}/MANIFEST.in +0 -0
  28. {orbitkit-0.8.19 → orbitkit-0.8.55}/README.md +0 -0
  29. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/__init__.py +0 -0
  30. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/airflow_handler/__init__.py +0 -0
  31. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/audio_transcoder/__init__.py +0 -0
  32. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/constant/__init__.py +0 -0
  33. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/constant/report_schema.py +0 -0
  34. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/id_srv/__init__.py +0 -0
  35. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/id_srv/id_gen.py +0 -0
  36. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/id_srv/id_perm_like.py +0 -0
  37. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/lark_send/__init__.py +0 -0
  38. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/lark_send/lark.py +0 -0
  39. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/llm_tools/__init__.py +0 -0
  40. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  41. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  42. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/orbit_type/tools.py +0 -0
  43. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_embedding/__init__.py +0 -0
  44. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  45. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  46. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor/__init__.py +0 -0
  47. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  48. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor/exceptions.py +0 -0
  49. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  50. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  51. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  52. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  53. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  54. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  55. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  56. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/base.py +0 -0
  57. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  58. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/core.py +0 -0
  59. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  60. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  61. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  62. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_writer/__init__.py +0 -0
  63. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  64. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/common.py +0 -0
  65. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/customize_regix_manager.py +0 -0
  66. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/secret_manager.py +0 -0
  67. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_aliyun.py +0 -0
  68. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  69. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_aws.py +0 -0
  70. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_date.py +0 -0
  71. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_html.py +0 -0
  72. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_kafka.py +0 -0
  73. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_md5.py +0 -0
  74. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_selenium.py +0 -0
  75. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_simple_timer.py +0 -0
  76. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_str.py +0 -0
  77. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_type_mapping.py +0 -0
  78. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit/util/util_url.py +0 -0
  79. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit.egg-info/dependency_links.txt +0 -0
  80. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit.egg-info/not-zip-safe +0 -0
  81. {orbitkit-0.8.19 → orbitkit-0.8.55}/orbitkit.egg-info/top_level.txt +0 -0
  82. {orbitkit-0.8.19 → orbitkit-0.8.55}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: orbitkit
- Version: 0.8.19
+ Version: 0.8.55
  Summary: This project is only for Orbit Tech internal use.
  Home-page: https://github.com/clown-0726/orbitkit
  Author: Lilu Cao
@@ -19,13 +19,22 @@ Classifier: Programming Language :: Python :: 3.4
  Classifier: Programming Language :: Python :: 3.5
  Classifier: Programming Language :: Python :: 3.6
  Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
  Classifier: Topic :: Software Development :: Libraries
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: boto3>=1.16.0
- Requires-Dist: requests>=2.12.1
- Requires-Dist: prettytable>=3.0.0
- Requires-Dist: pytz>=2022.1
+ Requires-Dist: boto3>=1.40.46
+ Requires-Dist: aioboto3>=15.5.0
+ Requires-Dist: aiofiles>=25.1.0
+ Requires-Dist: requests>=2.32.5
+ Requires-Dist: prettytable>=3.16.0
+ Requires-Dist: pytz>=2025.2
  Requires-Dist: Deprecated
  Requires-Dist: func_timeout
  Dynamic: author
@@ -35,6 +44,7 @@ Dynamic: description
  Dynamic: description-content-type
  Dynamic: home-page
  Dynamic: license
+ Dynamic: license-file
  Dynamic: maintainer
  Dynamic: maintainer-email
  Dynamic: platform
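
Note on the metadata hunk above: the supported-Python classifiers now run through 3.14, and the pins move to current releases, with aioboto3 and aiofiles added for the new async S3 code paths. A minimal sketch for sanity-checking an installed copy against this diff, using only the standard library and assuming orbitkit 0.8.55 is installed:

    from importlib.metadata import requires, version

    # Should print 0.8.55 followed by the pins shown above
    # (boto3>=1.40.46, aioboto3>=15.5.0, aiofiles>=25.1.0, ...).
    print(version("orbitkit"))
    for requirement in requires("orbitkit"):
        print(requirement)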
@@ -0,0 +1 @@
+ 0.8.55
@@ -0,0 +1,234 @@
+ import os
+ import datetime
+ from collections import defaultdict
+ from importlib.metadata import version
+ import googletrans
+
+
+ class DocumentProcessor:
+     if version("googletrans") < "4.0.2":
+         raise ImportError(f"googletrans >= 4.0.2 is required for async support. {version('googletrans')}")
+     AUDIO_SUFFIXES = [".mp3", ".wav", ".aac", ".wma", ".m4a"]
+     VIDEO_SUFFIXES = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".3gp", ".hevc"]
+     PDF_SUFFIXES = [".pdf"]
+     DOC_SUFFIXES = [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"]
+     TXT_SUFFIXES = [".txt", ".html", ".htm", ".xhtml"]
+     ALL_ALLOWED_SUFFIXES = set(AUDIO_SUFFIXES + VIDEO_SUFFIXES + PDF_SUFFIXES + DOC_SUFFIXES + TXT_SUFFIXES)
+
+     DATA_PROCESS_STEPS = ['convert', 'extract', 'embedding', 'success']
+
+     @staticmethod
+     def get_file_suffix(file_path):
+         return f".{file_path.split('.')[-1]}".lower()
+
+     @staticmethod
+     async def translate_text(text, dest='en'):
+         """Async translation helper. https://pypi.org/project/googletrans/"""
+         translator = googletrans.Translator()
+         result = await translator.translate(text, dest=dest)
+         return result.text
+
+     @staticmethod
+     def create_xbrl_template():
+         return {
+             "_id": "",
+             "source_type": "",
+             "x_attachments": [],
+             "status": "init",
+             "logs": [],
+             "metadata": {},
+             "x_created_date": datetime.datetime.now(),
+             "x_updated_date": datetime.datetime.now(),
+         }
+
+     @classmethod
+     def stock_us_filter_by_is_primary(cls, doc):
+         if not doc:
+             return None
+         if doc.get('x_spider_name') != 'stock_us':
+             return doc
+         doc['x_attachments'] = [att for att in doc.get('x_attachments', []) if 'is_primary' in att]
+         if len(doc['x_attachments']) == 0:
+             return None
+         return doc
+
+     @classmethod
+     def stock_indiabse_filter_by_prefix(cls, doc):
+         if not doc:
+             return None
+         if doc.get('x_spider_name') != 'stock_indiabse':
+             return doc
+         doc['x_attachments'] = [att for att in doc.get('x_attachments', []) if att['file_type'].lower() != '.xml']
+         if len(doc['x_attachments']) == 0:
+             return None
+         return doc
+
+     @classmethod
+     def file_type_filter(cls, doc):
+         if not doc:
+             return None
+         suffixes = {cls.get_file_suffix(att['store_path']) for att in doc.get('x_attachments', [])}
+         return doc if suffixes.issubset(cls.ALL_ALLOWED_SUFFIXES) else None
+
+     @classmethod
+     def xbrl_type_check(cls, doc):
+         is_xbrl = doc.get('x_info_data', {}).get('is_xbrl') == 'true'
+         x_attachments = doc.get('x_attachments', [])
+         convert_status = doc.get('x_status_list', {}).get('status_convert', {}).get('status')
+         xhtml_count = sum(1 for att in x_attachments if att['store_path'].lower().endswith('.xhtml'))
+
+         if is_xbrl or (xhtml_count > 0 and convert_status != 'convert_done'):
+             template = cls.create_xbrl_template()
+             template['_id'] = doc['_id']
+             template['source_type'] = doc.get('x_report_source', {}).get('source_type', '')
+             template['x_attachments'] = [{
+                 "file_hash": att['file_hash'],
+                 "store_path": f"s3://{att['bucket']}/{att['store_path']}",
+                 "store_path_converted_pdf": "",
+                 "store_path_converted_pdf_image": "",
+             } for att in x_attachments]
+             return True, template
+
+         return False, None
+
+     @staticmethod
+     def get_start_stage_target_stage(doc, custom_process_step_list):
+         status_info = doc.get('x_status_list', {}).get('status_convert', {})
+         status = status_info.get('status')
+         status_txt = status_info.get('status_txt')
+         x_spider_name = doc['x_spider_name']
+
+         if custom_process_step_list:
+             return custom_process_step_list[0], custom_process_step_list[1], x_spider_name
+
+         if status != 'convert_done':
+             return 'convert', 'embedding', x_spider_name
+
+         if status_txt not in ['convert_txt_done', 'convert_txt_embedding']:
+             return 'extract', 'embedding', x_spider_name
+
+         if status_txt == 'convert_txt_done':
+             return 'embedding', 'embedding', x_spider_name
+
+         return 'success', 'success', x_spider_name
+
+     @staticmethod
+     def update_target_stage_by_report_type(doc, target_stage):
+         report_type_ids = doc.get('x_orbit_data', {}).get('report_type_id_list', [])
+         return "extract" if report_type_ids == ['19999'] else target_stage
+
+     @staticmethod
+     def update_target_stage_by_reported_at(doc, target_stage):
+         date_str = doc.get('x_reported_at_utc_date', '1970-01-01')
+         try:
+             reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
+         except ValueError:
+             reported_date = datetime.datetime(1970, 1, 1)
+         return "extract" if reported_date < datetime.datetime(2023, 1, 1) else target_stage
+
+     @staticmethod
+     def update_target_stage_by_perm_match(doc, target_stage):
+         perm_match_status = doc['x_status_list']['status_perm']['status']
+         return target_stage if perm_match_status in {'perm_match_part', 'perm_match'} else "extract"
+
+     @classmethod
+     async def create_record(cls, doc, start_stage, important_level):
+         attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
+         s3_path_info = []
+         add_extends = {}
+         for att in attachments:
+             if len(att['file_name']) > 2000 or len(att['file_name'].encode('utf-8')) > 2000:
+                 return False, None
+
+             if start_stage == 'convert' and not add_extends:
+                 _, ext = os.path.splitext(att['store_path'])
+                 if ext in cls.AUDIO_SUFFIXES or ext in cls.VIDEO_SUFFIXES:
+                     add_extends = {
+                         "original_title": doc['x_orbit_data']['report_title'],
+                         "title": await cls.translate_text(text=doc['x_orbit_data']['report_title']),
+                         "published": doc['x_reported_at_utc_date'],
+                         "tickers": [],
+                         "perm_id_list": doc['x_orbit_data']['perm_id_list'],
+                         "report_type_id_list_str": doc['x_orbit_data']['report_type_id_list']
+                     }
+
+             s3_path_info.append({
+                 'store_path': f"s3://{att['bucket']}/{att['store_path']}" if start_stage == 'convert' else att[
+                     'store_path'],
+                 'file_name': att['file_name']
+             })
+         result_dict = {'id': doc['_id'], 's3_path_info': s3_path_info, 'important_level': important_level}
+         if add_extends:
+             result_dict['extends'] = add_extends
+         return True, result_dict
+
+     @staticmethod
+     def create_result_info(process_type, message, result_data):
+         return {
+             'process_type': process_type,
+             'message': message,
+             'result_data': result_data
+         }
+
+     @classmethod
+     async def process(cls, doc, custom_process_step, important_level):
+         report_id = doc['_id']
+         # Filter attachments first
+         doc = cls.stock_us_filter_by_is_primary(doc)
+         doc = cls.stock_indiabse_filter_by_prefix(doc)
+         # File-type validation must run after the attachment filters above
+         doc = cls.file_type_filter(doc)
+         if doc is None:
+             return cls.create_result_info("error", "Document file type is not allowed.", report_id)
+
+         is_xbrl, xbrl_data = cls.xbrl_type_check(doc)
+         if is_xbrl:
+             return cls.create_result_info("xbrl", "XBRL or XHTML format cannot be processed.", xbrl_data)
+
+         start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc, custom_process_step)
+
+         # Skip embedding for special cases: report type '19999', or a report date before 2023-01-01
+         if target_stage == 'embedding' and not custom_process_step:
+             target_stage = cls.update_target_stage_by_report_type(doc, target_stage)
+             target_stage = cls.update_target_stage_by_reported_at(doc, target_stage)
+             target_stage = cls.update_target_stage_by_perm_match(doc, target_stage)
+         # Edge case: only embedding is needed, but the rules above capped it at extraction, which would leave the status inconsistent
+         if start_stage == 'embedding' and target_stage == 'extract':
+             start_stage = 'success'
+             target_stage = 'success'
+
+         if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
+             return cls.create_result_info("step_error",
+                                           "Invalid process sequence: 'target_stage' occurs before 'start_stage'.",
+                                           report_id)
+
+         file_name_check_status, record = await cls.create_record(doc, start_stage, important_level)
+         if not file_name_check_status:
+             return cls.create_result_info("error", "Document file name too long.", report_id)
+
+         return cls.create_result_info("file_flow", "Success", [start_stage, target_stage, x_spider_name, record])
+
+     @classmethod
+     def split_data_by_spider_name_and_step(cls, process_records):
+         file_flow_info = defaultdict(list)
+         xbrl_data_list = []
+         except_id_list = []
+         doc_error_list = []
+
+         for item in process_records:
+             process_type = item.get('process_type')
+             if process_type == 'xbrl':
+                 xbrl_data_list.append(item['result_data'])
+             elif process_type == 'file_flow':
+                 start_stage, target_stage, x_spider_name, record = item['result_data']
+                 key = f"{start_stage}@__@{target_stage}@__@{x_spider_name}"
+                 file_flow_info[key].append(record)
+             elif process_type == 'step_error':
+                 except_id_list.append(item['result_data'])
+             elif process_type == 'error':
+                 doc_error_list.append(item['result_data'])
+             else:
+                 raise KeyError(
+                     f"Unknown process_type: {process_type}. Expected one of ['xbrl', 'file_flow', 'step_error', 'error'].")
+
+         return file_flow_info, xbrl_data_list, except_id_list, doc_error_list
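
The new data_preprocessing.py above reduces one MongoDB report document to a routing decision: filter attachments per spider, divert XBRL/XHTML documents, pick a start and target stage, and emit a record for the file flow. A minimal sketch of driving it by hand; the document shape is inferred from the fields the class reads, all values are hypothetical, and googletrans >= 4.0.2 must be installed because the class enforces that at import time:

    import asyncio
    from orbitkit.airflow_handler.data_preprocessing import DocumentProcessor

    # Hypothetical report document carrying only the fields the processor reads.
    doc = {
        "_id": "r-001",
        "x_spider_name": "stock_us",
        "x_attachments": [{"bucket": "demo-bucket", "store_path": "a.pdf",
                           "file_name": "a.pdf", "file_hash": "h1", "is_primary": True}],
        "x_status_list": {"status_convert": {}, "status_perm": {"status": "perm_match"}},
        "x_orbit_data": {"report_type_id_list": [], "perm_id_list": [], "report_title": "Q1"},
        "x_reported_at_utc_date": "2024-05-01",
        "x_info_data": {},
    }

    result = asyncio.run(DocumentProcessor.process(doc, custom_process_step=None, important_level=1))
    flows, xbrl, step_errors, doc_errors = DocumentProcessor.split_data_by_spider_name_and_step([result])
    # flows is keyed "start@__@target@__@spider"; here "convert@__@embedding@__@stock_us".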
@@ -0,0 +1,274 @@
+ import os
+ from collections import Counter
+ from datetime import datetime
+ from typing import Optional
+ import logging
+ import pymongo
+ import pytz
+ import boto3
+ from sqlalchemy import create_engine, Table, MetaData, select
+ from sqlalchemy.orm import sessionmaker, scoped_session
+ from contextlib import contextmanager
+
+ from orbitkit.airflow_handler.file_handler_v2 import FileFlowHandleV2
+ from orbitkit.airflow_handler.data_preprocessing import DocumentProcessor
+ from orbitkit.orbit_type import OrbitTypeMatcher
+
+ logger = logging.getLogger(__name__)
+
+
+ class FilingOfficialProcessor:
+
+     def __init__(self, mongo_uri=None, postgres_uri=None, aws_access_key_id=None, aws_secret_access_key=None, pi2_postgres_uri=None, pi2_database_name=None, databases_fileflow=None):
+         mongo_uri = os.environ.get('MONGO_URI_MAIN_USER_APP') if not mongo_uri else mongo_uri
+         if not mongo_uri:
+             raise KeyError('mongo_uri not set.')
+
+         if not aws_secret_access_key or not aws_access_key_id:
+             raise KeyError('aws_access_key_id and aws_secret_access_key not set.')
+
+         self.mongo_client = pymongo.MongoClient(mongo_uri)
+         self.data_xbrl_convert_collection = self.mongo_client['filing_reports']['data_xbrl_convert']
+
+         postgres_uri = os.environ.get('PG_URI_AIRFLOW12_USER_NEWSFEEDSITE') if not postgres_uri else postgres_uri
+         if not postgres_uri:
+             raise KeyError('postgres_uri not set.')
+         databases_fileflow = databases_fileflow or "process_net"
+         self.file_handler = FileFlowHandleV2(postgres_uri=postgres_uri, database_name=databases_fileflow)
+         self.data_processor = DocumentProcessor()
+         self.max_batch_size = 10000
+         self.all_stat_count = {'all': 0, 'skip': 0, 'doc_error': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
+
+         self.s3_client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
+         self.matcher = OrbitTypeMatcher(self.s3_client)
+         self.report_type_id_name_map = {i["lv3_id"]: i["lv3_name"] for i in self.matcher.get_full_type_list()}
+
+         self.pi2_postgres_uri = pi2_postgres_uri or os.environ['PG_URI_CX45_USER_GLAUUIADMIN']
+         if not self.pi2_postgres_uri:
+             raise KeyError('pi2_postgres_uri not set.')
+         self.databases = pi2_database_name or 'newsfeedsite'
+         self.postgres_engine = create_engine(f"{self.pi2_postgres_uri}/{self.databases}", connect_args={"sslmode": "require"})
+         self.postgres_session = sessionmaker(bind=self.postgres_engine)
+         self.Session = scoped_session(self.postgres_session)
+         self.postgres_metadata = MetaData()
+
+         self.pi2_table = Table(
+             'primary_instrument_2_release', self.postgres_metadata,
+             autoload_with=self.postgres_engine, schema='security_master'
+         )
+
+         self.postgres_engine2 = create_engine(f"{postgres_uri}/{databases_fileflow}",
+                                               connect_args={"sslmode": "require"})
+         self.postgres_session2 = sessionmaker(bind=self.postgres_engine2)
+         self.Session2 = scoped_session(self.postgres_session2)
+
+         self.op_meta = Table(
+             'op_meta', self.postgres_metadata,
+             autoload_with=self.postgres_engine2, schema='public'
+         )
+
+     @contextmanager
+     def session_scope(self, use_session=None):
+         session = self.Session() if not use_session else use_session
+         try:
+             yield session
+             session.commit()
+         except Exception:
+             session.rollback()
+             raise
+         finally:
+             self.Session.remove()
+
+     def create_spider_name_source_type_map(self, collection, label):
+
+         def find_duplicates(keys):
+             return [k for k, v in Counter(keys).items() if v > 1]
+
+         map_dict = {}
+         pipeline = [{'$group': {'_id': "$x_spider_name"}}]
+
+         for document in collection.aggregate(pipeline):
+             map_dict[document['_id']] = label
+
+         all_keys = list(map_dict.keys())
+         duplicates = find_duplicates(all_keys)
+         if duplicates:
+             raise KeyError(f"Duplicate x_spider_name found: {duplicates}")
+
+         return map_dict
+
+     def send_xbrl_data_to_mongo(self, xbrl_data_list):
+         if not xbrl_data_list:
+             return
+         report_id_list = list(set([i['_id'] for i in xbrl_data_list]))
+         result = self.data_xbrl_convert_collection.find({'_id': {'$in': report_id_list}}, {'_id': 1}).batch_size(self.max_batch_size)
+         exists_id_list = [i['_id'] for i in result]
+         new_xbrl_data_list = [i for i in xbrl_data_list if i['_id'] not in exists_id_list]
+         if not new_xbrl_data_list:
+             return
+         self.data_xbrl_convert_collection.insert_many(new_xbrl_data_list)
+         logger.info(f'{len(new_xbrl_data_list)} xbrl data inserted.')
+
+     def update_doc_status_to_convert(self, collection, report_id_list):
+         if len(report_id_list) == 0:
+             return
+         collection.update_many(
+             {"_id": {"$in": report_id_list}},
+             {
+                 "$set": {
+                     "x_status_list.status_convert.status": "convert_failed",
+                     "x_status_list.status_convert.status_txt": "convert_txt_init",
+                     "x_status_list.status_convert.status_meta": "meta_init",
+                     "x_updated_date": datetime.now(tz=pytz.timezone('UTC')).strftime("%Y-%m-%dT%H:%M:%S%z"),
+                 },
+                 "$unset": {
+                     "x_attachments_pdf": ""
+                 }
+             }
+         )
+         logger.info(f'Unable to convert {len(report_id_list)} document(s) due to unsupported file type.')
+
+     def update_extends_fields(self, perm_id_list, file_flow_info):
+         stmt = select(self.pi2_table.c.orbit_entity_id, self.pi2_table.c.ticker).where(self.pi2_table.c.orbit_entity_id.in_(perm_id_list))
+         orbit_entity_id_ticker_map = {}
+         with self.session_scope() as session:
+             result = session.execute(stmt)
+             for row in result:
+                 if row.orbit_entity_id not in orbit_entity_id_ticker_map:
+                     orbit_entity_id_ticker_map[row.orbit_entity_id] = []
+
+                 if row.ticker is not None:
+                     orbit_entity_id_ticker_map[row.orbit_entity_id].append(row.ticker)
+         for step_info, records in file_flow_info.items():
+             for record in records:
+                 if 'extends' in record and record.get('extends') is not None:
+                     tickers = set()
+                     for i in record['extends']['perm_id_list']:
+                         tickers.update(orbit_entity_id_ticker_map.get(i, []))
+                     record['extends']['tickers'] = list(tickers)
+
+                     record['extends']['report_type_id_list_str'] = [self.report_type_id_name_map.get(i) for i in record['extends']['report_type_id_list_str']]
+
+         return file_flow_info
+
+     def send_task(self, file_flow_info, tags, is_important, priority, spider_name_source_type):
+         for step_str, records in file_flow_info.items():
+             steps = step_str.split('@__@')
+             start_stage = steps[0]
+             target_stage = steps[1]
+             x_spider_name = steps[2]
+
+             if start_stage == 'success' or target_stage == 'success':
+                 self.all_stat_count['skip'] += len(records)
+                 logger.info(
+                     f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: False, message: 'File has already completed the embedding stage.'")
+                 continue
+
+             if is_important:
+                 logger.info(f"is_important: {is_important} - {x_spider_name}")
+                 status, ids, message = self.file_handler.entry_point_urgent(records=records, start_stage=start_stage,
+                                                                             target_stage=target_stage,
+                                                                             tags=tags,
+                                                                             tag=x_spider_name,
+                                                                             priority=priority,
+                                                                             source_type=spider_name_source_type[
+                                                                                 x_spider_name])
+             else:
+                 status, ids, message = self.file_handler.entry_point(records=records, start_stage=start_stage,
+                                                                      target_stage=target_stage, tags=tags, tag=x_spider_name,
+                                                                      priority=priority,
+                                                                      source_type=spider_name_source_type[x_spider_name])
+             self.all_stat_count['file_flow'] += len(records)
+             logger.info(f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: {status}, message: {message}")
+
+     def op_meat_deduplicate_docs(self, docs, buffer_size=1000):
+         buffer = []
+
+         for doc in docs:
+             buffer.append(doc)
+
+             if len(buffer) >= buffer_size:
+                 doc_ids = [d['_id'] for d in buffer]
+                 with self.session_scope(use_session=self.Session2) as session:
+                     existing_ids = session.query(self.op_meta.c.id).filter(self.op_meta.c.id.in_(doc_ids)).all()
+                     existing_ids = {i[0] for i in existing_ids}
+                 for buffered_doc in buffer:
+                     self.all_stat_count['all'] += 1
+                     if buffered_doc['_id'] not in existing_ids:
+                         yield buffered_doc
+
+                 buffer.clear()
+
+         if buffer:
+             doc_ids = [d['_id'] for d in buffer]
+             with self.session_scope(use_session=self.Session2) as session:
+                 existing_ids = session.query(self.op_meta.c.id).filter(self.op_meta.c.id.in_(doc_ids)).all()
+                 existing_ids = {i[0] for i in existing_ids}
+             for buffered_doc in buffer:
+                 self.all_stat_count['all'] += 1
+                 if buffered_doc['_id'] not in existing_ids:
+                     yield buffered_doc
+
+             buffer.clear()
+
+     async def process_task_entry(self, source: str,
+                                  query: dict, tags: list[str], priority: str,
+                                  is_important: bool = False, custom_step: Optional[list[str]] = None, important_level=None, db_name: str = None):
+
+         if not important_level or not isinstance(important_level, int):
+             important_level = 0
+
+         if important_level == 0:
+             raise ValueError(f'important_level must be an integer (int) greater than 0. {important_level}')
+
+         allowed_steps = {"convert", "extract", "embedding"}
+         if custom_step is not None:
+             if not isinstance(custom_step, list):
+                 raise ValueError("custom_step must be a list or None.")
+             if len(custom_step) > 2:
+                 raise ValueError("custom_step can contain at most two elements.")
+             for step in custom_step:
+                 if step not in allowed_steps:
+                     raise ValueError(f"Invalid step '{step}'. Allowed steps are: {allowed_steps}")
+
+         collection = self.mongo_client[db_name if db_name else "filing_reports"][source]
+         spider_name_source_type = self.create_spider_name_source_type_map(collection, source)
+
+         process_data = []
+         perm_id_set = set()
+         logger.info(f"loading {source} data.")
+         docs = collection.find(query).batch_size(1000)
+         duplicate_docs = self.op_meat_deduplicate_docs(docs, buffer_size=self.max_batch_size) if not is_important else docs
+         for doc in duplicate_docs:
+             for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
+                 perm_id_set.add(orbit_entity_id)
+             result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step, important_level=important_level)
+             process_data.append(result_record)
+             if len(process_data) >= self.max_batch_size:
+                 file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
+                     process_data)
+                 file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
+                 self.all_stat_count['doc_error'] += len(doc_error_list)
+                 self.all_stat_count['step_error'] += len(except_id_list)
+                 self.all_stat_count['xbrl'] += len(xbrl_data)
+                 self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
+                 self.send_xbrl_data_to_mongo(xbrl_data)
+                 self.update_doc_status_to_convert(collection, doc_error_list)
+                 process_data.clear()
+                 perm_id_set.clear()
+
+         if process_data:
+             file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
+                 process_data)
+             file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
+             self.all_stat_count['doc_error'] += len(doc_error_list)
+             self.all_stat_count['step_error'] += len(except_id_list)
+             self.all_stat_count['xbrl'] += len(xbrl_data)
+             self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
+             self.send_xbrl_data_to_mongo(xbrl_data)
+             self.update_doc_status_to_convert(collection, doc_error_list)
+             process_data.clear()
+             perm_id_set.clear()
+
+         logger.info(f"finished processing {self.all_stat_count}.")
+         self.all_stat_count = {'all': 0, 'skip': 0, 'doc_error': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
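
file_flow_entry_process.py above wires DocumentProcessor into MongoDB, two Postgres databases, and S3, then batches documents into FileFlowHandleV2 tasks. A sketch of an entry-point call; every connection string, key, and query value below is a hypothetical placeholder (only the keyword names come from the signatures in this diff), and the constructor falls back to the MONGO_URI/PG environment variables when arguments are omitted:

    import asyncio
    from orbitkit.airflow_handler.file_flow_entry_process import FilingOfficialProcessor

    processor = FilingOfficialProcessor(
        mongo_uri="mongodb://user:pass@mongo-host/",         # hypothetical
        postgres_uri="postgresql://user:pass@pg-host",       # hypothetical
        pi2_postgres_uri="postgresql://user:pass@pi2-host",  # hypothetical
        aws_access_key_id="AKIA...",                         # hypothetical
        aws_secret_access_key="...",                         # hypothetical
    )

    asyncio.run(processor.process_task_entry(
        source="data_official",                  # hypothetical collection name
        query={"x_spider_name": "stock_us"},     # hypothetical filter
        tags=["backfill"],
        priority="low",
        important_level=1,
    ))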