orbitkit 0.8.41__tar.gz → 0.8.43__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.41/orbitkit.egg-info → orbitkit-0.8.43}/PKG-INFO +1 -1
- orbitkit-0.8.43/orbitkit/VERSION +1 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/data_preprocessing.py +27 -17
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/file_flow_entry_process.py +24 -15
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/file_flow_exit_process.py +11 -7
- {orbitkit-0.8.41 → orbitkit-0.8.43/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.41/orbitkit/VERSION +0 -1
- {orbitkit-0.8.41 → orbitkit-0.8.43}/LICENSE +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/MANIFEST.in +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/README.md +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/setup.cfg +0 -0
- {orbitkit-0.8.41 → orbitkit-0.8.43}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.43
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import datetime
|
|
3
3
|
from collections import defaultdict
|
|
4
|
-
from
|
|
4
|
+
from importlib.metadata import version
|
|
5
|
+
import googletrans
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class DocumentProcessor:
|
|
8
|
-
|
|
9
|
+
if version("googletrans") < "4.0.2":
|
|
10
|
+
raise ImportError(f"googletrans >= 4.0.2 is required for async support. {version('googletrans')}")
|
|
9
11
|
AUDIO_SUFFIXES = [".mp3", ".wav", ".aac", ".wma", ".m4a"]
|
|
10
12
|
VIDEO_SUFFIXES = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".3gp", ".hevc"]
|
|
11
13
|
PDF_SUFFIXES = [".pdf"]
|
|
@@ -20,9 +22,10 @@ class DocumentProcessor:
|
|
|
20
22
|
return f".{file_path.split('.')[-1]}".lower()
|
|
21
23
|
|
|
22
24
|
@staticmethod
|
|
23
|
-
def translate_text(text, dest='en'):
|
|
24
|
-
|
|
25
|
-
|
|
25
|
+
async def translate_text(text, dest='en'):
|
|
26
|
+
"""异步翻译函数 https://pypi.org/project/googletrans/"""
|
|
27
|
+
translator = googletrans.Translator()
|
|
28
|
+
result = await translator.translate(text, dest=dest)
|
|
26
29
|
return result.text
|
|
27
30
|
|
|
28
31
|
@staticmethod
|
|
@@ -88,12 +91,15 @@ class DocumentProcessor:
|
|
|
88
91
|
return False, None
|
|
89
92
|
|
|
90
93
|
@staticmethod
|
|
91
|
-
def get_start_stage_target_stage(doc):
|
|
94
|
+
def get_start_stage_target_stage(doc, custom_process_step_list):
|
|
92
95
|
status_info = doc.get('x_status_list', {}).get('status_convert', {})
|
|
93
96
|
status = status_info.get('status')
|
|
94
97
|
status_txt = status_info.get('status_txt')
|
|
95
98
|
x_spider_name = doc['x_spider_name']
|
|
96
99
|
|
|
100
|
+
if custom_process_step_list:
|
|
101
|
+
return custom_process_step_list[0], custom_process_step_list[1], x_spider_name
|
|
102
|
+
|
|
97
103
|
if status != 'convert_done':
|
|
98
104
|
return 'convert', 'embedding', x_spider_name
|
|
99
105
|
|
|
@@ -106,12 +112,12 @@ class DocumentProcessor:
|
|
|
106
112
|
return 'success', 'success', x_spider_name
|
|
107
113
|
|
|
108
114
|
@staticmethod
|
|
109
|
-
def
|
|
115
|
+
def update_target_stage_by_report_type(doc, target_stage):
|
|
110
116
|
report_type_ids = doc.get('x_orbit_data', {}).get('report_type_id_list', [])
|
|
111
117
|
return "extract" if report_type_ids == ['19999'] else target_stage
|
|
112
118
|
|
|
113
119
|
@staticmethod
|
|
114
|
-
def
|
|
120
|
+
def update_target_stage_by_reported_at(doc, target_stage):
|
|
115
121
|
date_str = doc.get('x_reported_at_utc_date', '1970-01-01')
|
|
116
122
|
try:
|
|
117
123
|
reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
|
|
@@ -120,7 +126,7 @@ class DocumentProcessor:
|
|
|
120
126
|
return "extract" if reported_date < datetime.datetime(2020, 1, 1) else target_stage
|
|
121
127
|
|
|
122
128
|
@classmethod
|
|
123
|
-
def create_record(cls, doc, start_stage):
|
|
129
|
+
async def create_record(cls, doc, start_stage):
|
|
124
130
|
attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
|
|
125
131
|
s3_path_info = []
|
|
126
132
|
add_extends = {}
|
|
@@ -133,7 +139,7 @@ class DocumentProcessor:
|
|
|
133
139
|
if ext in cls.AUDIO_SUFFIXES or ext in cls.VIDEO_SUFFIXES:
|
|
134
140
|
add_extends = {
|
|
135
141
|
"original_title": doc['x_orbit_data']['report_title'],
|
|
136
|
-
"title": cls.translate_text(doc['x_orbit_data']['report_title']),
|
|
142
|
+
"title": await cls.translate_text(text=doc['x_orbit_data']['report_title']),
|
|
137
143
|
"published": doc['x_reported_at_utc_date'],
|
|
138
144
|
"tickers": [],
|
|
139
145
|
"perm_id_list": doc['x_orbit_data']['perm_id_list'],
|
|
@@ -159,7 +165,7 @@ class DocumentProcessor:
|
|
|
159
165
|
}
|
|
160
166
|
|
|
161
167
|
@classmethod
|
|
162
|
-
def process(cls, doc,
|
|
168
|
+
async def process(cls, doc, custom_process_step):
|
|
163
169
|
report_id = doc['_id']
|
|
164
170
|
# 筛选文件
|
|
165
171
|
doc = cls.stock_us_filter_by_is_primary(doc)
|
|
@@ -173,19 +179,23 @@ class DocumentProcessor:
|
|
|
173
179
|
if is_xbrl:
|
|
174
180
|
return cls.create_result_info("xbrl", "XBRL format cannot be processed.", xbrl_data)
|
|
175
181
|
|
|
176
|
-
start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc)
|
|
182
|
+
start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc, custom_process_step)
|
|
177
183
|
|
|
178
|
-
# 特殊条件下的数据不做embedding ('19999'类型和报告日期小于2020-01-01)
|
|
179
|
-
if target_stage == 'embedding' and
|
|
180
|
-
target_stage = cls.
|
|
181
|
-
target_stage = cls.
|
|
184
|
+
# 判断 特殊条件下的数据不做embedding ('19999'类型和报告日期小于2020-01-01)
|
|
185
|
+
if target_stage == 'embedding' and not custom_process_step:
|
|
186
|
+
target_stage = cls.update_target_stage_by_report_type(doc, target_stage)
|
|
187
|
+
target_stage = cls.update_target_stage_by_reported_at(doc, target_stage)
|
|
188
|
+
# 特殊情况下只需要做提取但是这个数据被过滤不需要做embedding
|
|
189
|
+
if start_stage == 'embedding' and target_stage == 'extract':
|
|
190
|
+
start_stage = 'success'
|
|
191
|
+
target_stage = 'success'
|
|
182
192
|
|
|
183
193
|
if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
|
|
184
194
|
return cls.create_result_info("step_error",
|
|
185
195
|
"Invalid process sequence: 'start_stage' occurs before 'target_stage'.",
|
|
186
196
|
doc['_id'])
|
|
187
197
|
|
|
188
|
-
file_name_check_status, record = cls.create_record(doc, start_stage)
|
|
198
|
+
file_name_check_status, record = await cls.create_record(doc, start_stage)
|
|
189
199
|
if not file_name_check_status:
|
|
190
200
|
return cls.create_result_info("step_error", "Document file name too lang.", report_id)
|
|
191
201
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from collections import Counter
|
|
3
3
|
from datetime import datetime
|
|
4
|
-
from typing import Literal
|
|
4
|
+
from typing import Literal, Optional
|
|
5
5
|
import logging
|
|
6
6
|
import pymongo
|
|
7
7
|
import pytz
|
|
@@ -30,18 +30,15 @@ class FilingOfficialProcessor:
|
|
|
30
30
|
self.mongo_client = pymongo.MongoClient(mongo_uri)
|
|
31
31
|
self.data_xbrl_convert_collection = self.mongo_client['filing_reports']['data_xbrl_convert']
|
|
32
32
|
self.filing_data_collection = self.mongo_client['filing_reports']['filing_data']
|
|
33
|
-
self.filing_reports_astock_test0822_collection = self.mongo_client['filing_reports']['filing_reports_astock_test0822']
|
|
34
33
|
self.annotation_reports_view_rows_collection = self.mongo_client['filing_reports'][
|
|
35
34
|
'annotation_reports_view_rows']
|
|
36
35
|
self.source_map = {
|
|
37
36
|
'filing_data': (self.filing_data_collection, 'filing_data'),
|
|
38
|
-
'
|
|
37
|
+
'G7_demo': (self.filing_data_collection, 'G7_demo'),
|
|
39
38
|
'reports_view': [
|
|
40
|
-
(self.filing_data_collection, 'filing_data')
|
|
41
|
-
(self.filing_reports_astock_test0822_collection, 'filing_reports_astock_test0822')
|
|
39
|
+
(self.filing_data_collection, 'filing_data')
|
|
42
40
|
]
|
|
43
41
|
}
|
|
44
|
-
self.only_low_important_set = {'internal_seekingalpha'}
|
|
45
42
|
postgres_uri = os.environ.get('PG_URI_AIRFLOW12_USER_NEWSFEEDSITE') if not postgres_uri else postgres_uri
|
|
46
43
|
if not postgres_uri:
|
|
47
44
|
raise KeyError('postgres_uri not set.')
|
|
@@ -122,7 +119,7 @@ class FilingOfficialProcessor:
|
|
|
122
119
|
"x_status_list.status_convert.status_meta": "meta_init",
|
|
123
120
|
"x_updated_date": datetime.now(tz=pytz.timezone('UTC')).strftime("%Y-%m-%dT%H:%M:%S%z"),
|
|
124
121
|
}})
|
|
125
|
-
logger.info(f'
|
|
122
|
+
logger.info(f'Unable to convert {len(report_id_list)} document(s) due to unsupported file type.')
|
|
126
123
|
|
|
127
124
|
def update_extends_fields(self, perm_id_list, file_flow_info):
|
|
128
125
|
stmt = select(self.pi2_table.c.orbit_entity_id, self.pi2_table.c.ticker).where(self.pi2_table.c.orbit_entity_id.in_(perm_id_list))
|
|
@@ -138,10 +135,10 @@ class FilingOfficialProcessor:
|
|
|
138
135
|
for step_info, records in file_flow_info.items():
|
|
139
136
|
for record in records:
|
|
140
137
|
if 'extends' in record and record.get('extends') is not None:
|
|
141
|
-
tickers =
|
|
138
|
+
tickers = set()
|
|
142
139
|
for i in record['extends']['perm_id_list']:
|
|
143
|
-
tickers.
|
|
144
|
-
record['extends']['tickers'] = tickers
|
|
140
|
+
tickers.update(orbit_entity_id_ticker_map.get(i, []))
|
|
141
|
+
record['extends']['tickers'] = list(tickers)
|
|
145
142
|
|
|
146
143
|
record['extends']['report_type_id_list_str'] = [self.report_type_id_name_map.get(i) for i in record['extends']['report_type_id_list_str']]
|
|
147
144
|
|
|
@@ -160,7 +157,7 @@ class FilingOfficialProcessor:
|
|
|
160
157
|
f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: False, message: 'File has already completed the embedding stage.' ")
|
|
161
158
|
continue
|
|
162
159
|
|
|
163
|
-
if is_important
|
|
160
|
+
if is_important:
|
|
164
161
|
logger.info(f"is_important: {is_important} - {x_spider_name}")
|
|
165
162
|
status, ids, message = self.file_handler.entry_point_urgent(records=records, start_stage=start_stage,
|
|
166
163
|
target_stage=target_stage,
|
|
@@ -178,9 +175,19 @@ class FilingOfficialProcessor:
|
|
|
178
175
|
logger.info(f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: {status}, message: {message}")
|
|
179
176
|
|
|
180
177
|
|
|
181
|
-
def process_task_entry(self, source: Literal["filing_data", "
|
|
178
|
+
async def process_task_entry(self, source: Literal["filing_data", "reports_view", "G7_demo"],
|
|
182
179
|
query: dict, tags: list[str], priority: str,
|
|
183
|
-
is_important: bool = False,
|
|
180
|
+
is_important: bool = False, custom_step: Optional[list[str]] = None):
|
|
181
|
+
|
|
182
|
+
allowed_steps = {"convert", "extract", "embedding"}
|
|
183
|
+
if custom_step is not None:
|
|
184
|
+
if not isinstance(custom_step, list):
|
|
185
|
+
raise ValueError("custom_step must be a list or None.")
|
|
186
|
+
if len(custom_step) > 2:
|
|
187
|
+
raise ValueError("custom_step can contain at most two elements.")
|
|
188
|
+
for step in custom_step:
|
|
189
|
+
if step not in allowed_steps:
|
|
190
|
+
raise ValueError(f"Invalid step '{step}'. Allowed steps are: {allowed_steps}")
|
|
184
191
|
|
|
185
192
|
if source == 'reports_view':
|
|
186
193
|
collections = self.source_map[source]
|
|
@@ -199,7 +206,8 @@ class FilingOfficialProcessor:
|
|
|
199
206
|
self.all_stat_count['all'] += 1
|
|
200
207
|
for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
|
|
201
208
|
perm_id_set.add(orbit_entity_id)
|
|
202
|
-
|
|
209
|
+
result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step)
|
|
210
|
+
process_data.append(result_record)
|
|
203
211
|
if len(process_data) >= self.max_batch_size:
|
|
204
212
|
file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
|
|
205
213
|
process_data)
|
|
@@ -226,4 +234,5 @@ class FilingOfficialProcessor:
|
|
|
226
234
|
process_data.clear()
|
|
227
235
|
perm_id_set.clear()
|
|
228
236
|
|
|
229
|
-
logger.info(f"finish processing {self.all_stat_count}.")
|
|
237
|
+
logger.info(f"finish processing {self.all_stat_count}. \n")
|
|
238
|
+
self.all_stat_count = {'all': 0, 'skip': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
|
|
@@ -27,9 +27,9 @@ class FlowUpdater:
|
|
|
27
27
|
setattr(self, collection_name, self.coon[data_source])
|
|
28
28
|
return getattr(self, collection_name)
|
|
29
29
|
|
|
30
|
-
def _handle_convert(self, status, attachments, db_store_path_set):
|
|
31
|
-
if not attachments:
|
|
32
|
-
raise ValueError("
|
|
30
|
+
def _handle_convert(self, status, attachments, db_store_path_set, attachments_pdf):
|
|
31
|
+
if not attachments or not attachments_pdf:
|
|
32
|
+
raise ValueError("Missing attachments: neither 'attachments' nor 'attachments_pdf' was provided.")
|
|
33
33
|
if not status:
|
|
34
34
|
return {
|
|
35
35
|
'x_status_list.status_convert.status': 'convert_failed',
|
|
@@ -38,15 +38,18 @@ class FlowUpdater:
|
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
store_path_set = set()
|
|
41
|
+
parent_id_store_path_map = {i['parent_id']: i['store_path'] for i in attachments_pdf}
|
|
41
42
|
x_attachments_pdf = []
|
|
42
43
|
for item in attachments:
|
|
43
44
|
store_path = item['store_path']
|
|
45
|
+
parent_id = item['id']
|
|
44
46
|
if store_path not in db_store_path_set:
|
|
45
47
|
raise ValueError(f"store_path not found in db: {store_path}")
|
|
46
48
|
if store_path in store_path_set:
|
|
47
49
|
continue
|
|
48
50
|
store_path_set.add(store_path)
|
|
49
|
-
new_store_path = store_path if store_path.lower().endswith('.pdf') else store_path + '.pdf'
|
|
51
|
+
# new_store_path = store_path if store_path.lower().endswith('.pdf') else store_path + '.pdf'
|
|
52
|
+
new_store_path = parent_id_store_path_map[parent_id]
|
|
50
53
|
x_attachments_pdf.append({
|
|
51
54
|
"store_path": new_store_path,
|
|
52
55
|
"store_path_txt": "",
|
|
@@ -76,11 +79,11 @@ class FlowUpdater:
|
|
|
76
79
|
return {}
|
|
77
80
|
return {'x_status_list.status_convert.status_txt': 'convert_txt_embedding'}
|
|
78
81
|
|
|
79
|
-
def _step_handle(self, step_stage, status, attachments, db_store_path):
|
|
82
|
+
def _step_handle(self, step_stage, status, attachments, db_store_path, attachments_pdf):
|
|
80
83
|
method_name = f"_handle_{step_stage}"
|
|
81
84
|
method = getattr(self, method_name, None)
|
|
82
85
|
if method:
|
|
83
|
-
return method(status, attachments=attachments,
|
|
86
|
+
return method(status, attachments=attachments, attachments_pdf=attachments_pdf,
|
|
84
87
|
db_store_path_set=db_store_path) if step_stage == 'convert' else method(status)
|
|
85
88
|
else:
|
|
86
89
|
raise ValueError(f"Unknown step_stage: {step_stage}")
|
|
@@ -102,6 +105,7 @@ class FlowUpdater:
|
|
|
102
105
|
current_stage = op_meta_record['current_stage']
|
|
103
106
|
target_stage = op_meta_record['target_stage']
|
|
104
107
|
attachments = op_meta_record['x_attachments']
|
|
108
|
+
attachments_pdf = op_meta_record['x_attachments_pdf']
|
|
105
109
|
data_source = op_meta_record['data_source']
|
|
106
110
|
|
|
107
111
|
# 校验参数
|
|
@@ -148,7 +152,7 @@ class FlowUpdater:
|
|
|
148
152
|
if step == end_stage and status == 'failed':
|
|
149
153
|
step_status = False
|
|
150
154
|
logger.info(f' Processing step-{index} {step} - {"successfully" if step_status else "failed"}.')
|
|
151
|
-
item = self._step_handle(step, step_status, attachments, db_store_path)
|
|
155
|
+
item = self._step_handle(step, step_status, attachments, db_store_path, attachments_pdf)
|
|
152
156
|
update_params.update(item)
|
|
153
157
|
|
|
154
158
|
# 执行更新
|
orbitkit-0.8.41/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.41
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|