orbitkit 0.8.30__tar.gz → 0.8.55__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.30/orbitkit.egg-info → orbitkit-0.8.55}/PKG-INFO +16 -6
- orbitkit-0.8.55/orbitkit/VERSION +1 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/airflow_handler/data_preprocessing.py +69 -14
- orbitkit-0.8.55/orbitkit/airflow_handler/file_flow_entry_process.py +274 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/airflow_handler/file_flow_exit_process.py +17 -16
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/airflow_handler/file_handler_v2.py +23 -15
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/audio_transcoder/netmind_extract_v1.py +5 -4
- orbitkit-0.8.55/orbitkit/orbit_type/__init__.py +1 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/orbit_type/orbit_type_simple.py +277 -1
- orbitkit-0.8.55/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +446 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/__init__.py +7 -0
- orbitkit-0.8.55/orbitkit/util/cache_asset_downloader.py +132 -0
- orbitkit-0.8.55/orbitkit/util/universal_extractor.py +525 -0
- orbitkit-0.8.55/orbitkit/util/util_aws_s3_wrapper.py +378 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55/orbitkit.egg-info}/PKG-INFO +16 -6
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit.egg-info/SOURCES.txt +3 -0
- orbitkit-0.8.55/orbitkit.egg-info/requires.txt +8 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/setup.py +13 -4
- orbitkit-0.8.30/orbitkit/VERSION +0 -1
- orbitkit-0.8.30/orbitkit/airflow_handler/file_flow_entry_process.py +0 -160
- orbitkit-0.8.30/orbitkit/orbit_type/__init__.py +0 -1
- orbitkit-0.8.30/orbitkit/util/util_aws_s3_wrapper.py +0 -154
- orbitkit-0.8.30/orbitkit.egg-info/requires.txt +0 -6
- {orbitkit-0.8.30 → orbitkit-0.8.55}/LICENSE +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/MANIFEST.in +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/README.md +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.30 → orbitkit-0.8.55}/setup.cfg +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: orbitkit
-Version: 0.8.
+Version: 0.8.55
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao
@@ -19,13 +19,22 @@ Classifier: Programming Language :: Python :: 3.4
 Classifier: Programming Language :: Python :: 3.5
 Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Software Development :: Libraries
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: boto3>=1.
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: boto3>=1.40.46
+Requires-Dist: aioboto3>=15.5.0
+Requires-Dist: aiofiles>=25.1.0
+Requires-Dist: requests>=2.32.5
+Requires-Dist: prettytable>=3.16.0
+Requires-Dist: pytz>=2025.2
 Requires-Dist: Deprecated
 Requires-Dist: func_timeout
 Dynamic: author
@@ -35,6 +44,7 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: license
+Dynamic: license-file
 Dynamic: maintainer
 Dynamic: maintainer-email
 Dynamic: platform
orbitkit/VERSION

@@ -0,0 +1 @@
+0.8.55
orbitkit/airflow_handler/data_preprocessing.py

@@ -1,13 +1,18 @@
+import os
 import datetime
 from collections import defaultdict
+from importlib.metadata import version
+import googletrans
 
 
 class DocumentProcessor:
+    if version("googletrans") < "4.0.2":
+        raise ImportError(f"googletrans >= 4.0.2 is required for async support. {version('googletrans')}")
     AUDIO_SUFFIXES = [".mp3", ".wav", ".aac", ".wma", ".m4a"]
     VIDEO_SUFFIXES = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".3gp", ".hevc"]
     PDF_SUFFIXES = [".pdf"]
     DOC_SUFFIXES = [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"]
-    TXT_SUFFIXES = [".txt", ".html", ".htm"]
+    TXT_SUFFIXES = [".txt", ".html", ".htm", ".xhtml"]
     ALL_ALLOWED_SUFFIXES = set(AUDIO_SUFFIXES + VIDEO_SUFFIXES + PDF_SUFFIXES + DOC_SUFFIXES + TXT_SUFFIXES)
 
     DATA_PROCESS_STEPS = ['convert', 'extract', 'embedding', 'success']
@@ -16,6 +21,13 @@ class DocumentProcessor:
     def get_file_suffix(file_path):
         return f".{file_path.split('.')[-1]}".lower()
 
+    @staticmethod
+    async def translate_text(text, dest='en'):
+        """Async translation helper. https://pypi.org/project/googletrans/"""
+        translator = googletrans.Translator()
+        result = await translator.translate(text, dest=dest)
+        return result.text
+
     @staticmethod
     def create_xbrl_template():
         return {
@@ -62,9 +74,10 @@ class DocumentProcessor:
     def xbrl_type_check(cls, doc):
         is_xbrl = doc.get('x_info_data', {}).get('is_xbrl') == 'true'
         x_attachments = doc.get('x_attachments', [])
+        convert_status = doc.get('x_status_list', {}).get('status_convert', {}).get('status')
         xhtml_count = sum(1 for att in x_attachments if att['store_path'].lower().endswith('.xhtml'))
 
-        if is_xbrl or xhtml_count > 0:
+        if is_xbrl or xhtml_count > 0 and convert_status != 'convert_done':
             template = cls.create_xbrl_template()
             template['_id'] = doc['_id']
             template['source_type'] = doc.get('x_report_source', {}).get('source_type', '')
@@ -79,12 +92,15 @@ class DocumentProcessor:
             return False, None
 
     @staticmethod
-    def get_start_stage_target_stage(doc):
+    def get_start_stage_target_stage(doc, custom_process_step_list):
         status_info = doc.get('x_status_list', {}).get('status_convert', {})
         status = status_info.get('status')
         status_txt = status_info.get('status_txt')
         x_spider_name = doc['x_spider_name']
 
+        if custom_process_step_list:
+            return custom_process_step_list[0], custom_process_step_list[1], x_spider_name
+
         if status != 'convert_done':
             return 'convert', 'embedding', x_spider_name
 
@@ -97,24 +113,54 @@ class DocumentProcessor:
         return 'success', 'success', x_spider_name
 
     @staticmethod
-    def
+    def update_target_stage_by_report_type(doc, target_stage):
         report_type_ids = doc.get('x_orbit_data', {}).get('report_type_id_list', [])
         return "extract" if report_type_ids == ['19999'] else target_stage
 
     @staticmethod
-    def
+    def update_target_stage_by_reported_at(doc, target_stage):
+        date_str = doc.get('x_reported_at_utc_date', '1970-01-01')
+        try:
+            reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
+        except ValueError:
+            reported_date = datetime.datetime(1970, 1, 1)
+        return "extract" if reported_date < datetime.datetime(2023, 1, 1) else target_stage
+
+    @staticmethod
+    def update_target_stage_by_perm_match(doc, target_stage):
+        perm_match_status = doc['x_status_list']['status_perm']['status']
+        return target_stage if perm_match_status in {'perm_match_part', 'perm_match'} else "extract"
+
+    @classmethod
+    async def create_record(cls, doc, start_stage, important_level):
         attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
         s3_path_info = []
+        add_extends = {}
         for att in attachments:
             if len(att['file_name']) > 2000 or len(att['file_name'].encode('utf-8')) > 2000:
                 return False, None
+
+            if start_stage == 'convert' and not add_extends:
+                _, ext = os.path.splitext(att['store_path'])
+                if ext in cls.AUDIO_SUFFIXES or ext in cls.VIDEO_SUFFIXES:
+                    add_extends = {
+                        "original_title": doc['x_orbit_data']['report_title'],
+                        "title": await cls.translate_text(text=doc['x_orbit_data']['report_title']),
+                        "published": doc['x_reported_at_utc_date'],
+                        "tickers": [],
+                        "perm_id_list": doc['x_orbit_data']['perm_id_list'],
+                        "report_type_id_list_str": doc['x_orbit_data']['report_type_id_list']
+                    }
+
             s3_path_info.append({
                 'store_path': f"s3://{att['bucket']}/{att['store_path']}" if start_stage == 'convert' else att[
                     'store_path'],
                 'file_name': att['file_name']
             })
-
-
+        result_dict = {'id': doc['_id'], 's3_path_info': s3_path_info, 'important_level': important_level}
+        if add_extends:
+            result_dict['extends'] = add_extends
+        return True, result_dict
 
     @staticmethod
     def create_result_info(process_type, message, result_data):
@@ -125,7 +171,7 @@ class DocumentProcessor:
         }
 
     @classmethod
-    def process(cls, doc):
+    async def process(cls, doc, custom_process_step, important_level):
         report_id = doc['_id']
         # Filter the files
         doc = cls.stock_us_filter_by_is_primary(doc)
@@ -137,19 +183,28 @@ class DocumentProcessor:
 
         is_xbrl, xbrl_data = cls.xbrl_type_check(doc)
         if is_xbrl:
-            return cls.create_result_info("xbrl", "XBRL format cannot be processed.", xbrl_data)
+            return cls.create_result_info("xbrl", "XBRL or Xhtml format cannot be processed.", xbrl_data)
+
+        start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc, custom_process_step)
 
-
-        target_stage
+        # Skip embedding for special cases (report type '19999' and report dates before 2023-01-01)
+        if target_stage == 'embedding' and not custom_process_step:
+            target_stage = cls.update_target_stage_by_report_type(doc, target_stage)
+            target_stage = cls.update_target_stage_by_reported_at(doc, target_stage)
+            target_stage = cls.update_target_stage_by_perm_match(doc, target_stage)
+        # Special case: only embedding is needed, but the rules above capped it at extract, which would leave an inconsistent state
+        if start_stage == 'embedding' and target_stage == 'extract':
+            start_stage = 'success'
+            target_stage = 'success'
 
         if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
             return cls.create_result_info("step_error",
                                           "Invalid process sequence: 'start_stage' occurs before 'target_stage'.",
-
+                                          report_id)
 
-        file_name_check_status, record = cls.create_record(doc, start_stage)
+        file_name_check_status, record = await cls.create_record(doc, start_stage, important_level)
         if not file_name_check_status:
-            return cls.create_result_info("
+            return cls.create_result_info("error", "Document file name too lang.", report_id)
 
         return cls.create_result_info("file_flow", "Success", [start_stage, target_stage, x_spider_name, record])
 
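The data_preprocessing.py changes above turn DocumentProcessor.process and the new translate_text helper into coroutines, so callers now have to await them from an event loop. A minimal sketch of the new calling convention (the sample string below is illustrative only, not taken from the package):

    import asyncio

    from orbitkit.airflow_handler.data_preprocessing import DocumentProcessor

    async def main():
        # translate_text is the new async staticmethod added in 0.8.55; it wraps googletrans.Translator
        # and therefore needs googletrans >= 4.0.2 installed, as the import-time check above enforces.
        title_en = await DocumentProcessor.translate_text("季度财务报告", dest="en")
        print(title_en)

    asyncio.run(main())

DocumentProcessor.process(doc, custom_process_step, important_level) follows the same pattern and is awaited by FilingOfficialProcessor.process_task_entry in the new module below.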
orbitkit/airflow_handler/file_flow_entry_process.py

@@ -0,0 +1,274 @@
+import os
+from collections import Counter
+from datetime import datetime
+from typing import Optional
+import logging
+import pymongo
+import pytz
+import boto3
+from sqlalchemy import create_engine, Table, MetaData, select
+from sqlalchemy.orm import sessionmaker, scoped_session
+from contextlib import contextmanager
+
+from orbitkit.airflow_handler.file_handler_v2 import FileFlowHandleV2
+from orbitkit.airflow_handler.data_preprocessing import DocumentProcessor
+from orbitkit.orbit_type import OrbitTypeMatcher
+
+logger = logging.getLogger(__name__)
+
+
+class FilingOfficialProcessor:
+
+    def __init__(self, mongo_uri=None, postgres_uri=None, aws_access_key_id=None, aws_secret_access_key=None, pi2_postgres_uri=None, pi2_database_name=None, databases_fileflow=None):
+        mongo_uri = os.environ.get('MONGO_URI_MAIN_USER_APP') if not mongo_uri else mongo_uri
+        if not mongo_uri:
+            raise KeyError('mongo_uri not set.')
+
+        if not aws_secret_access_key or not aws_access_key_id:
+            raise KeyError('aws_access_key_id and aws_secret_access_key not set.')
+
+        self.mongo_client = pymongo.MongoClient(mongo_uri)
+        self.data_xbrl_convert_collection = self.mongo_client['filing_reports']['data_xbrl_convert']
+
+        postgres_uri = os.environ.get('PG_URI_AIRFLOW12_USER_NEWSFEEDSITE') if not postgres_uri else postgres_uri
+        if not postgres_uri:
+            raise KeyError('postgres_uri not set.')
+        databases_fileflow = databases_fileflow or "process_net"
+        self.file_handler = FileFlowHandleV2(postgres_uri=postgres_uri, database_name=databases_fileflow)
+        self.data_processor = DocumentProcessor()
+        self.max_batch_size = 10000
+        self.all_stat_count = {'all': 0, 'skip': 0, 'doc_error': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
+
+        self.s3_client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
+        self.matcher = OrbitTypeMatcher(self.s3_client)
+        self.report_type_id_name_map = {i["lv3_id"]: i["lv3_name"] for i in self.matcher.get_full_type_list()}
+
+        self.pi2_postgres_uri = pi2_postgres_uri or os.environ['PG_URI_CX45_USER_GLAUUIADMIN']
+        if not self.pi2_postgres_uri:
+            raise KeyError('pie_postgres_uri not set.')
+        self.databases = pi2_database_name or 'newsfeedsite'
+        self.postgres_engine = create_engine(f"{self.pi2_postgres_uri}/{self.databases}", connect_args={"sslmode": "require"})
+        self.postgres_session = sessionmaker(bind=self.postgres_engine)
+        self.Session = scoped_session(self.postgres_session)
+        self.postgres_metadata = MetaData()
+
+        self.pi2_table = Table(
+            'primary_instrument_2_release', self.postgres_metadata,
+            autoload_with=self.postgres_engine, schema='security_master'
+        )
+
+        self.postgres_engine2 = create_engine(f"{postgres_uri}/{databases_fileflow}",
+                                              connect_args={"sslmode": "require"})
+        self.postgres_session2 = sessionmaker(bind=self.postgres_engine2)
+        self.Session2 = scoped_session(self.postgres_session2)
+
+        self.op_meta = Table(
+            'op_meta', self.postgres_metadata,
+            autoload_with=self.postgres_engine2, schema='public'
+        )
+
+    @contextmanager
+    def session_scope(self, use_session=None):
+        session = self.Session() if not use_session else use_session
+        try:
+            yield session
+            session.commit()
+        except Exception:
+            session.rollback()
+            raise
+        finally:
+            self.Session.remove()
+
+    def create_spider_name_source_type_map(self, collection, label):
+
+        def find_duplicates(keys):
+            return [k for k, v in Counter(keys).items() if v > 1]
+
+        map_dict = {}
+        pipeline = [{'$group': {'_id': "$x_spider_name"}}]
+
+        for document in collection.aggregate(pipeline):
+            map_dict[document['_id']] = label
+
+        all_keys = list(map_dict.keys())
+        duplicates = find_duplicates(all_keys)
+        if duplicates:
+            raise KeyError(f"Duplicate x_spider_name found: {duplicates}")
+
+        return map_dict
+
+    def send_xbrl_data_to_mongo(self, xbrl_data_list):
+        if not xbrl_data_list:
+            return
+        report_id_list = list(set([i['_id'] for i in xbrl_data_list]))
+        result = self.data_xbrl_convert_collection.find({'_id': {'$in': report_id_list}}, {'_id': 1}).batch_size(self.max_batch_size)
+        exists_id_list = [i['_id'] for i in result]
+        new_xbrl_data_list = [i for i in xbrl_data_list if i['_id'] not in exists_id_list]
+        if not new_xbrl_data_list:
+            return
+        self.data_xbrl_convert_collection.insert_many(new_xbrl_data_list)
+        logger.info(f'{len(new_xbrl_data_list)}-xbrl data inserted.')
+
+    def update_doc_status_to_convert(self, collection, report_id_list):
+        if len(report_id_list) == 0:
+            return
+        collection.update_many(
+            {"_id": {"$in": report_id_list}},
+            {
+                "$set": {
+                    "x_status_list.status_convert.status": "convert_failed",
+                    "x_status_list.status_convert.status_txt": "convert_txt_init",
+                    "x_status_list.status_convert.status_meta": "meta_init",
+                    "x_updated_date": datetime.now(tz=pytz.timezone('UTC')).strftime("%Y-%m-%dT%H:%M:%S%z"),
+                },
+                "$unset": {
+                    "x_attachments_pdf": ""
+                }
+            }
+        )
+        logger.info(f'Unable to convert {len(report_id_list)} document(s) due to unsupported file type.')
+
+    def update_extends_fields(self, perm_id_list, file_flow_info):
+        stmt = select(self.pi2_table.c.orbit_entity_id, self.pi2_table.c.ticker).where(self.pi2_table.c.orbit_entity_id.in_(perm_id_list))
+        orbit_entity_id_ticker_map = {}
+        with self.session_scope() as session:
+            result = session.execute(stmt)
+            for row in result:
+                if row.orbit_entity_id not in orbit_entity_id_ticker_map:
+                    orbit_entity_id_ticker_map[row.orbit_entity_id] = []
+
+                if row.ticker is not None:
+                    orbit_entity_id_ticker_map[row.orbit_entity_id].append(row.ticker)
+        for step_info, records in file_flow_info.items():
+            for record in records:
+                if 'extends' in record and record.get('extends') is not None:
+                    tickers = set()
+                    for i in record['extends']['perm_id_list']:
+                        tickers.update(orbit_entity_id_ticker_map.get(i, []))
+                    record['extends']['tickers'] = list(tickers)
+
+                    record['extends']['report_type_id_list_str'] = [self.report_type_id_name_map.get(i) for i in record['extends']['report_type_id_list_str']]
+
+        return file_flow_info
+
+    def send_task(self, file_flow_info, tags, is_important, priority, spider_name_source_type):
+        for step_str, records in file_flow_info.items():
+            steps = step_str.split('@__@')
+            start_stage = steps[0]
+            target_stage = steps[1]
+            x_spider_name = steps[2]
+
+            if start_stage == 'success' or target_stage == 'success':
+                self.all_stat_count['skip'] += len(records)
+                logger.info(
+                    f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: False, message: 'File has already completed the embedding stage.' ")
+                continue
+
+            if is_important:
+                logger.info(f"is_important: {is_important} - {x_spider_name}")
+                status, ids, message = self.file_handler.entry_point_urgent(records=records, start_stage=start_stage,
+                                                                            target_stage=target_stage,
+                                                                            tags=tags,
+                                                                            tag=x_spider_name,
+                                                                            priority=priority,
+                                                                            source_type=spider_name_source_type[
+                                                                                x_spider_name])
+            else:
+                status, ids, message = self.file_handler.entry_point(records=records, start_stage=start_stage,
+                                                                     target_stage=target_stage, tags=tags,tag=x_spider_name,
+                                                                     priority=priority,
+                                                                     source_type=spider_name_source_type[x_spider_name])
+            self.all_stat_count['file_flow'] += len(records)
+            logger.info(f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: {status}, message: {message}")
+
+    def op_meat_deduplicate_docs(self, docs, buffer_size=1000):
+        buffer = []
+
+        for doc in docs:
+            buffer.append(doc)
+
+            if len(buffer) >= buffer_size:
+                doc_ids = [d['_id'] for d in buffer]
+                with self.session_scope(use_session=self.Session2) as session:
+                    existing_ids = session.query(self.op_meta.c.id).filter(self.op_meta.c.id.in_(doc_ids)).all()
+                    existing_ids = {i[0] for i in existing_ids}
+                for buffered_doc in buffer:
+                    self.all_stat_count['all'] += 1
+                    if buffered_doc['_id'] not in existing_ids:
+                        yield buffered_doc
+
+                buffer.clear()
+
+        if buffer:
+            doc_ids = [d['_id'] for d in buffer]
+            with self.session_scope(use_session=self.Session2) as session:
+                existing_ids = session.query(self.op_meta.c.id).filter(self.op_meta.c.id.in_(doc_ids)).all()
+                existing_ids = {i[0] for i in existing_ids}
+            for buffered_doc in buffer:
+                self.all_stat_count['all'] += 1
+                if buffered_doc['_id'] not in existing_ids:
+                    yield buffered_doc
+
+            buffer.clear()
+
+    async def process_task_entry(self, source: str,
+                                 query: dict, tags: list[str], priority: str,
+                                 is_important: bool = False, custom_step: Optional[list[str]] = None, important_level = None, db_name: str = None):
+
+        if not important_level or not isinstance(important_level, int):
+            important_level = 0
+
+        if important_level == 0:
+            raise ValueError(f'important_level must be an integer (int) greater than 0. {important_level}')
+
+        allowed_steps = {"convert", "extract", "embedding"}
+        if custom_step is not None:
+            if not isinstance(custom_step, list):
+                raise ValueError("custom_step must be a list or None.")
+            if len(custom_step) > 2:
+                raise ValueError("custom_step can contain at most two elements.")
+            for step in custom_step:
+                if step not in allowed_steps:
+                    raise ValueError(f"Invalid step '{step}'. Allowed steps are: {allowed_steps}")
+
+        collection = self.mongo_client[db_name if db_name else "filing_reports"][source]
+        spider_name_source_type = self.create_spider_name_source_type_map(collection, source)
+
+        process_data = []
+        perm_id_set = set()
+        logger.info(f"load {source} data.")
+        docs = collection.find(query).batch_size(1000)
+        duplicate_docs = self.op_meat_deduplicate_docs(docs, buffer_size=self.max_batch_size) if not is_important else docs
+        for doc in duplicate_docs:
+            for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
+                perm_id_set.add(orbit_entity_id)
+            result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step, important_level=important_level)
+            process_data.append(result_record)
+            if len(process_data) >= self.max_batch_size:
+                file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
+                    process_data)
+                file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
+                self.all_stat_count['doc_error'] += len(doc_error_list)
+                self.all_stat_count['step_error'] += len(except_id_list)
+                self.all_stat_count['xbrl'] += len(xbrl_data)
+                self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
+                self.send_xbrl_data_to_mongo(xbrl_data)
+                self.update_doc_status_to_convert(collection, doc_error_list)
+                process_data.clear()
+                perm_id_set.clear()
+
+        if process_data:
+            file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
+                process_data)
+            file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
+            self.all_stat_count['doc_error'] += len(doc_error_list)
+            self.all_stat_count['step_error'] += len(except_id_list)
+            self.all_stat_count['xbrl'] += len(xbrl_data)
+            self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
+            self.send_xbrl_data_to_mongo(xbrl_data)
+            self.update_doc_status_to_convert(collection, doc_error_list)
+            process_data.clear()
+            perm_id_set.clear()
+
+        logger.info(f"finish processing {self.all_stat_count}. \n")
+        self.all_stat_count = {'all': 0, 'skip': 0, 'doc_error': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
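The new file_flow_entry_process.py above replaces the previous 160-line entry module with the FilingOfficialProcessor class, whose async process_task_entry coroutine drives batching, deduplication against op_meta, and dispatch into the convert/extract/embedding flow. A minimal invocation sketch, assuming the connection-string environment variables named in the module (MONGO_URI_MAIN_USER_APP, PG_URI_AIRFLOW12_USER_NEWSFEEDSITE, PG_URI_CX45_USER_GLAUUIADMIN) are already set; the collection name, query, tags and priority below are placeholders:

    import asyncio
    import os

    from orbitkit.airflow_handler.file_flow_entry_process import FilingOfficialProcessor

    async def main():
        processor = FilingOfficialProcessor(
            aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
            aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
        )
        # source/query are hypothetical; important_level must be an int greater than 0.
        await processor.process_task_entry(
            source="data_official",
            query={"x_spider_name": "demo_spider"},
            tags=["backfill"],
            priority="low",
            important_level=1,
        )

    asyncio.run(main())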
orbitkit/airflow_handler/file_flow_exit_process.py

@@ -27,9 +27,7 @@ class FlowUpdater:
         setattr(self, collection_name, self.coon[data_source])
         return getattr(self, collection_name)
 
-    def _handle_convert(self, status, attachments, db_store_path_set):
-        if not attachments:
-            raise ValueError("No attachments provided.")
+    def _handle_convert(self, status, attachments, db_store_path_set, attachments_pdf):
         if not status:
             return {
                 'x_status_list.status_convert.status': 'convert_failed',
@@ -37,16 +35,24 @@
                 'x_status_list.status_convert.status_meta': 'meta_init'
             }
 
+        if not attachments or not attachments_pdf:
+            raise ValueError("Missing attachments: neither 'attachments' nor 'attachments_pdf' was provided.")
+
         store_path_set = set()
+        parent_id_store_path_map = {i['parent_id']: i['store_path'] for i in attachments_pdf}
         x_attachments_pdf = []
         for item in attachments:
-
+            # The video branch will generate store_path_pre, so check for the existence of store_path_pre first.
+            # For other branches, the value will be null.
+            store_path = item['store_path_pre'] or item['store_path']
+            parent_id = item['id']
             if store_path not in db_store_path_set:
                 raise ValueError(f"store_path not found in db: {store_path}")
             if store_path in store_path_set:
                 continue
             store_path_set.add(store_path)
-            new_store_path = store_path if store_path.lower().endswith('.pdf') else store_path + '.pdf'
+            # new_store_path = store_path if store_path.lower().endswith('.pdf') else store_path + '.pdf'
+            new_store_path = parent_id_store_path_map[parent_id]
             x_attachments_pdf.append({
                 "store_path": new_store_path,
                 "store_path_txt": "",
@@ -76,11 +82,11 @@
             return {}
         return {'x_status_list.status_convert.status_txt': 'convert_txt_embedding'}
 
-    def _step_handle(self, step_stage, status, attachments, db_store_path):
+    def _step_handle(self, step_stage, status, attachments, db_store_path, attachments_pdf):
         method_name = f"_handle_{step_stage}"
         method = getattr(self, method_name, None)
         if method:
-            return method(status, attachments=attachments,
+            return method(status, attachments=attachments, attachments_pdf=attachments_pdf,
                           db_store_path_set=db_store_path) if step_stage == 'convert' else method(status)
         else:
             raise ValueError(f"Unknown step_stage: {step_stage}")
@@ -102,16 +108,16 @@
         current_stage = op_meta_record['current_stage']
         target_stage = op_meta_record['target_stage']
         attachments = op_meta_record['x_attachments']
+        attachments_pdf = op_meta_record['x_attachments_pdf']
         data_source = op_meta_record['data_source']
 
-        # Validate parameters
         if not report_id or not status or not start_stage or not current_stage or not target_stage or (not attachments and start_stage == 'convert'):
             raise ValueError(f"Invalid op_meta_record: {op_meta_record}")
         if status == 'success' and target_stage != current_stage:
-
+            logger.error(f"Invalid current_stage: {current_stage}-{report_id}")
+            return
         attachments = [i for i in attachments if i['category'] == 'x_attachments']
 
-        # Determine the end stage
         end_stage = target_stage if status == 'success' else current_stage if status == 'failed' else None
         if end_stage is None:
             logger.info(f"Invalid status: {status}.")
@@ -121,11 +127,9 @@
         if start_index > end_index:
             raise ValueError(f"start_stage cannot be after end_stage: {start_stage} -> {end_stage}.")
 
-        # Start the write-back logic
         logger.info(
             f"😊 _id: {report_id}-{status}, start_step: {self.step_tuple[start_index]}, end_step: {self.step_tuple[end_index]}")
 
-        # Check whether this report exists in the current data source
         db_doc = self._check_and_create_collection(data_source).find_one({'_id': report_id},
                                                                          {'_id': 1, 'x_attachments': 1,
                                                                           'x_status_list': 1})
@@ -137,20 +141,17 @@
             logger.warning(f"{db_doc['_id']} statxus is not 'crawl_downloaded'")
             return
 
-        # Build the set of store_paths that exist in the DB, to guard against x_attachments / x_attachments_pdf inconsistencies
         db_store_path = {f"s3://{i['bucket']}/{i['store_path']}" for i in db_doc['x_attachments']}
 
-        # Build the update parameters
         update_params = {}
         step_status = True
         for index, step in enumerate(self.step_tuple[start_index:end_index + 1], 1):
             if step == end_stage and status == 'failed':
                 step_status = False
             logger.info(f' Processing step-{index} {step} - {"successfully" if step_status else "failed"}.')
-            item = self._step_handle(step, step_status, attachments, db_store_path)
+            item = self._step_handle(step, step_status, attachments, db_store_path, attachments_pdf)
             update_params.update(item)
 
-        # Apply the update
         if update_params:
             # logger.info(json.dumps(update_params, ensure_ascii=False, indent=2))
             self.update_mongo_data(report_id, data_source, update_params, kafka_ignore)