orbitkit 0.8.44__tar.gz → 0.8.46__tar.gz
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- {orbitkit-0.8.44/orbitkit.egg-info → orbitkit-0.8.46}/PKG-INFO +1 -1
- orbitkit-0.8.46/orbitkit/VERSION +1 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/data_preprocessing.py +4 -4
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/file_flow_entry_process.py +8 -2
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/file_flow_exit_process.py +3 -8
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/file_handler_v2.py +2 -1
- {orbitkit-0.8.44 → orbitkit-0.8.46/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.44/orbitkit/VERSION +0 -1
- {orbitkit-0.8.44 → orbitkit-0.8.46}/LICENSE +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/MANIFEST.in +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/README.md +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/setup.cfg +0 -0
- {orbitkit-0.8.44 → orbitkit-0.8.46}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.46
|
|
@@ -126,7 +126,7 @@ class DocumentProcessor:
|
|
|
126
126
|
return "extract" if reported_date < datetime.datetime(2020, 1, 1) else target_stage
|
|
127
127
|
|
|
128
128
|
@classmethod
|
|
129
|
-
async def create_record(cls, doc, start_stage):
|
|
129
|
+
async def create_record(cls, doc, start_stage, important_level):
|
|
130
130
|
attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
|
|
131
131
|
s3_path_info = []
|
|
132
132
|
add_extends = {}
|
|
@@ -151,7 +151,7 @@ class DocumentProcessor:
|
|
|
151
151
|
'store_path'],
|
|
152
152
|
'file_name': att['file_name']
|
|
153
153
|
})
|
|
154
|
-
result_dict = {'id': doc['_id'], 's3_path_info': s3_path_info}
|
|
154
|
+
result_dict = {'id': doc['_id'], 's3_path_info': s3_path_info, 'important_level': important_level}
|
|
155
155
|
if add_extends:
|
|
156
156
|
result_dict['extends'] = add_extends
|
|
157
157
|
return True, result_dict
|
|
@@ -165,7 +165,7 @@ class DocumentProcessor:
|
|
|
165
165
|
}
|
|
166
166
|
|
|
167
167
|
@classmethod
|
|
168
|
-
async def process(cls, doc, custom_process_step):
|
|
168
|
+
async def process(cls, doc, custom_process_step, important_level):
|
|
169
169
|
report_id = doc['_id']
|
|
170
170
|
# 筛选文件
|
|
171
171
|
doc = cls.stock_us_filter_by_is_primary(doc)
|
|
@@ -195,7 +195,7 @@ class DocumentProcessor:
|
|
|
195
195
|
"Invalid process sequence: 'start_stage' occurs before 'target_stage'.",
|
|
196
196
|
doc['_id'])
|
|
197
197
|
|
|
198
|
-
file_name_check_status, record = await cls.create_record(doc, start_stage)
|
|
198
|
+
file_name_check_status, record = await cls.create_record(doc, start_stage, important_level)
|
|
199
199
|
if not file_name_check_status:
|
|
200
200
|
return cls.create_result_info("step_error", "Document file name too lang.", report_id)
|
|
201
201
|
|
|
@@ -177,7 +177,13 @@ class FilingOfficialProcessor:
|
|
|
177
177
|
|
|
178
178
|
async def process_task_entry(self, source: Literal["filing_data", "reports_view", "G7_demo"],
|
|
179
179
|
query: dict, tags: list[str], priority: str,
|
|
180
|
-
is_important: bool = False, custom_step: Optional[list[str]] = None):
|
|
180
|
+
is_important: bool = False, custom_step: Optional[list[str]] = None, important_level = None):
|
|
181
|
+
|
|
182
|
+
if not important_level or not isinstance(important_level, int):
|
|
183
|
+
important_level = 0
|
|
184
|
+
|
|
185
|
+
if important_level == 0:
|
|
186
|
+
raise ValueError(f'important_level must be an integer (int) greater than 0. {important_level}')
|
|
181
187
|
|
|
182
188
|
allowed_steps = {"convert", "extract", "embedding"}
|
|
183
189
|
if custom_step is not None:
|
|
@@ -206,7 +212,7 @@ class FilingOfficialProcessor:
|
|
|
206
212
|
self.all_stat_count['all'] += 1
|
|
207
213
|
for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
|
|
208
214
|
perm_id_set.add(orbit_entity_id)
|
|
209
|
-
result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step)
|
|
215
|
+
result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step, important_level=important_level)
|
|
210
216
|
process_data.append(result_record)
|
|
211
217
|
if len(process_data) >= self.max_batch_size:
|
|
212
218
|
file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
|
|
@@ -42,7 +42,9 @@ class FlowUpdater:
|
|
|
42
42
|
parent_id_store_path_map = {i['parent_id']: i['store_path'] for i in attachments_pdf}
|
|
43
43
|
x_attachments_pdf = []
|
|
44
44
|
for item in attachments:
|
|
45
|
-
|
|
45
|
+
# The video branch will generate store_path_pre, so check for the existence of store_path_pre first.
|
|
46
|
+
# For other branches, the value will be null.
|
|
47
|
+
store_path = item['store_path_pre'] or item['store_path']
|
|
46
48
|
parent_id = item['id']
|
|
47
49
|
if store_path not in db_store_path_set:
|
|
48
50
|
raise ValueError(f"store_path not found in db: {store_path}")
|
|
@@ -109,7 +111,6 @@ class FlowUpdater:
|
|
|
109
111
|
attachments_pdf = op_meta_record['x_attachments_pdf']
|
|
110
112
|
data_source = op_meta_record['data_source']
|
|
111
113
|
|
|
112
|
-
# 校验参数
|
|
113
114
|
if not report_id or not status or not start_stage or not current_stage or not target_stage or (not attachments and start_stage == 'convert'):
|
|
114
115
|
raise ValueError(f"Invalid op_meta_record: {op_meta_record}")
|
|
115
116
|
if status == 'success' and target_stage != current_stage:
|
|
@@ -117,7 +118,6 @@ class FlowUpdater:
|
|
|
117
118
|
return
|
|
118
119
|
attachments = [i for i in attachments if i['category'] == 'x_attachments']
|
|
119
120
|
|
|
120
|
-
# 确定结束阶段
|
|
121
121
|
end_stage = target_stage if status == 'success' else current_stage if status == 'failed' else None
|
|
122
122
|
if end_stage is None:
|
|
123
123
|
logger.info(f"Invalid status: {status}.")
|
|
@@ -127,11 +127,9 @@ class FlowUpdater:
|
|
|
127
127
|
if start_index > end_index:
|
|
128
128
|
raise ValueError(f"start_stage cannot be after end_stage: {start_stage} -> {end_stage}.")
|
|
129
129
|
|
|
130
|
-
# 开始执行回更逻辑
|
|
131
130
|
logger.info(
|
|
132
131
|
f"😊 _id: {report_id}-{status}, start_step: {self.step_tuple[start_index]}, end_step: {self.step_tuple[end_index]}")
|
|
133
132
|
|
|
134
|
-
# 查询这个报告是否存在于当前数据源
|
|
135
133
|
db_doc = self._check_and_create_collection(data_source).find_one({'_id': report_id},
|
|
136
134
|
{'_id': 1, 'x_attachments': 1,
|
|
137
135
|
'x_status_list': 1})
|
|
@@ -143,10 +141,8 @@ class FlowUpdater:
|
|
|
143
141
|
logger.warning(f"{db_doc['_id']} statxus is not 'crawl_downloaded'")
|
|
144
142
|
return
|
|
145
143
|
|
|
146
|
-
# 构建数据库中存在store_path 防止出现数据库中x_attachments与x_attachments_pdf数据不一致问题
|
|
147
144
|
db_store_path = {f"s3://{i['bucket']}/{i['store_path']}" for i in db_doc['x_attachments']}
|
|
148
145
|
|
|
149
|
-
# 构建更新参数
|
|
150
146
|
update_params = {}
|
|
151
147
|
step_status = True
|
|
152
148
|
for index, step in enumerate(self.step_tuple[start_index:end_index + 1], 1):
|
|
@@ -156,7 +152,6 @@ class FlowUpdater:
|
|
|
156
152
|
item = self._step_handle(step, step_status, attachments, db_store_path, attachments_pdf)
|
|
157
153
|
update_params.update(item)
|
|
158
154
|
|
|
159
|
-
# 执行更新
|
|
160
155
|
if update_params:
|
|
161
156
|
# logger.info(json.dumps(update_params, ensure_ascii=False, indent=2))
|
|
162
157
|
self.update_mongo_data(report_id, data_source, update_params, kafka_ignore)
|
orbitkit-0.8.44/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.44
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|