orbitkit 0.8.43__tar.gz → 0.8.45__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- {orbitkit-0.8.43/orbitkit.egg-info → orbitkit-0.8.45}/PKG-INFO +1 -1
- orbitkit-0.8.45/orbitkit/VERSION +1 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/airflow_handler/file_flow_exit_process.py +6 -10
- {orbitkit-0.8.43 → orbitkit-0.8.45/orbitkit.egg-info}/PKG-INFO +1 -1
- orbitkit-0.8.43/orbitkit/VERSION +0 -1
- {orbitkit-0.8.43 → orbitkit-0.8.45}/LICENSE +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/MANIFEST.in +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/README.md +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/airflow_handler/data_preprocessing.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/airflow_handler/file_flow_entry_process.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/setup.cfg +0 -0
- {orbitkit-0.8.43 → orbitkit-0.8.45}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.8.45
|
|
@@ -28,8 +28,6 @@ class FlowUpdater:
|
|
|
28
28
|
return getattr(self, collection_name)
|
|
29
29
|
|
|
30
30
|
def _handle_convert(self, status, attachments, db_store_path_set, attachments_pdf):
|
|
31
|
-
if not attachments or not attachments_pdf:
|
|
32
|
-
raise ValueError("Missing attachments: neither 'attachments' nor 'attachments_pdf' was provided.")
|
|
33
31
|
if not status:
|
|
34
32
|
return {
|
|
35
33
|
'x_status_list.status_convert.status': 'convert_failed',
|
|
@@ -37,11 +35,16 @@ class FlowUpdater:
|
|
|
37
35
|
'x_status_list.status_convert.status_meta': 'meta_init'
|
|
38
36
|
}
|
|
39
37
|
|
|
38
|
+
if not attachments or not attachments_pdf:
|
|
39
|
+
raise ValueError("Missing attachments: neither 'attachments' nor 'attachments_pdf' was provided.")
|
|
40
|
+
|
|
40
41
|
store_path_set = set()
|
|
41
42
|
parent_id_store_path_map = {i['parent_id']: i['store_path'] for i in attachments_pdf}
|
|
42
43
|
x_attachments_pdf = []
|
|
43
44
|
for item in attachments:
|
|
44
|
-
|
|
45
|
+
# The video branch will generate store_path_pre, so check for the existence of store_path_pre first.
|
|
46
|
+
# For other branches, the value will be null.
|
|
47
|
+
store_path = item['store_path_pre'] or item['store_path']
|
|
45
48
|
parent_id = item['id']
|
|
46
49
|
if store_path not in db_store_path_set:
|
|
47
50
|
raise ValueError(f"store_path not found in db: {store_path}")
|
|
@@ -108,7 +111,6 @@ class FlowUpdater:
|
|
|
108
111
|
attachments_pdf = op_meta_record['x_attachments_pdf']
|
|
109
112
|
data_source = op_meta_record['data_source']
|
|
110
113
|
|
|
111
|
-
# 校验参数
|
|
112
114
|
if not report_id or not status or not start_stage or not current_stage or not target_stage or (not attachments and start_stage == 'convert'):
|
|
113
115
|
raise ValueError(f"Invalid op_meta_record: {op_meta_record}")
|
|
114
116
|
if status == 'success' and target_stage != current_stage:
|
|
@@ -116,7 +118,6 @@ class FlowUpdater:
|
|
|
116
118
|
return
|
|
117
119
|
attachments = [i for i in attachments if i['category'] == 'x_attachments']
|
|
118
120
|
|
|
119
|
-
# 确定结束阶段
|
|
120
121
|
end_stage = target_stage if status == 'success' else current_stage if status == 'failed' else None
|
|
121
122
|
if end_stage is None:
|
|
122
123
|
logger.info(f"Invalid status: {status}.")
|
|
@@ -126,11 +127,9 @@ class FlowUpdater:
|
|
|
126
127
|
if start_index > end_index:
|
|
127
128
|
raise ValueError(f"start_stage cannot be after end_stage: {start_stage} -> {end_stage}.")
|
|
128
129
|
|
|
129
|
-
# 开始执行回更逻辑
|
|
130
130
|
logger.info(
|
|
131
131
|
f"😊 _id: {report_id}-{status}, start_step: {self.step_tuple[start_index]}, end_step: {self.step_tuple[end_index]}")
|
|
132
132
|
|
|
133
|
-
# 查询这个报告是否存在于当前数据源
|
|
134
133
|
db_doc = self._check_and_create_collection(data_source).find_one({'_id': report_id},
|
|
135
134
|
{'_id': 1, 'x_attachments': 1,
|
|
136
135
|
'x_status_list': 1})
|
|
@@ -142,10 +141,8 @@ class FlowUpdater:
|
|
|
142
141
|
logger.warning(f"{db_doc['_id']} statxus is not 'crawl_downloaded'")
|
|
143
142
|
return
|
|
144
143
|
|
|
145
|
-
# 构建数据库中存在store_path 防止出现数据库中x_attachments与x_attachments_pdf数据不一致问题
|
|
146
144
|
db_store_path = {f"s3://{i['bucket']}/{i['store_path']}" for i in db_doc['x_attachments']}
|
|
147
145
|
|
|
148
|
-
# 构建更新参数
|
|
149
146
|
update_params = {}
|
|
150
147
|
step_status = True
|
|
151
148
|
for index, step in enumerate(self.step_tuple[start_index:end_index + 1], 1):
|
|
@@ -155,7 +152,6 @@ class FlowUpdater:
|
|
|
155
152
|
item = self._step_handle(step, step_status, attachments, db_store_path, attachments_pdf)
|
|
156
153
|
update_params.update(item)
|
|
157
154
|
|
|
158
|
-
# 执行更新
|
|
159
155
|
if update_params:
|
|
160
156
|
# logger.info(json.dumps(update_params, ensure_ascii=False, indent=2))
|
|
161
157
|
self.update_mongo_data(report_id, data_source, update_params, kafka_ignore)
|
orbitkit-0.8.43/orbitkit/VERSION
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.8.43
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|