orbitkit 0.8.44__tar.gz → 0.8.46__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {orbitkit-0.8.44/orbitkit.egg-info → orbitkit-0.8.46}/PKG-INFO +1 -1
  2. orbitkit-0.8.46/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/data_preprocessing.py +4 -4
  4. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/file_flow_entry_process.py +8 -2
  5. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/file_flow_exit_process.py +3 -8
  6. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/file_handler_v2.py +2 -1
  7. {orbitkit-0.8.44 → orbitkit-0.8.46/orbitkit.egg-info}/PKG-INFO +1 -1
  8. orbitkit-0.8.44/orbitkit/VERSION +0 -1
  9. {orbitkit-0.8.44 → orbitkit-0.8.46}/LICENSE +0 -0
  10. {orbitkit-0.8.44 → orbitkit-0.8.46}/MANIFEST.in +0 -0
  11. {orbitkit-0.8.44 → orbitkit-0.8.46}/README.md +0 -0
  12. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/__init__.py +0 -0
  13. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/__init__.py +0 -0
  14. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/audio_transcoder/__init__.py +0 -0
  16. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  17. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/constant/__init__.py +0 -0
  18. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/constant/report_schema.py +0 -0
  19. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/id_srv/__init__.py +0 -0
  20. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/id_srv/id_gen.py +0 -0
  21. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/id_srv/id_perm_like.py +0 -0
  22. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/lark_send/__init__.py +0 -0
  23. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/lark_send/lark.py +0 -0
  24. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/llm_tools/__init__.py +0 -0
  25. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  26. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/orbit_type/__init__.py +0 -0
  27. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  28. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  29. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/orbit_type/tools.py +0 -0
  30. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_embedding/__init__.py +0 -0
  31. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  32. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  33. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/__init__.py +0 -0
  34. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  35. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/exceptions.py +0 -0
  36. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  37. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  38. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  39. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  40. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  41. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  42. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  43. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  44. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  45. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  46. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/base.py +0 -0
  47. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  48. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/core.py +0 -0
  49. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  50. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  51. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  52. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_writer/__init__.py +0 -0
  53. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  54. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/__init__.py +0 -0
  55. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/cache_asset_downloader.py +0 -0
  56. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/common.py +0 -0
  57. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/customize_regix_manager.py +0 -0
  58. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/secret_manager.py +0 -0
  59. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_aliyun.py +0 -0
  60. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  61. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_aws.py +0 -0
  62. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  63. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_date.py +0 -0
  64. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_html.py +0 -0
  65. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_kafka.py +0 -0
  66. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_md5.py +0 -0
  67. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_selenium.py +0 -0
  68. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_simple_timer.py +0 -0
  69. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_str.py +0 -0
  70. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_type_mapping.py +0 -0
  71. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit/util/util_url.py +0 -0
  72. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit.egg-info/SOURCES.txt +0 -0
  73. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit.egg-info/dependency_links.txt +0 -0
  74. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit.egg-info/not-zip-safe +0 -0
  75. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit.egg-info/requires.txt +0 -0
  76. {orbitkit-0.8.44 → orbitkit-0.8.46}/orbitkit.egg-info/top_level.txt +0 -0
  77. {orbitkit-0.8.44 → orbitkit-0.8.46}/setup.cfg +0 -0
  78. {orbitkit-0.8.44 → orbitkit-0.8.46}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: orbitkit
3
- Version: 0.8.44
3
+ Version: 0.8.46
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.46
@@ -126,7 +126,7 @@ class DocumentProcessor:
126
126
  return "extract" if reported_date < datetime.datetime(2020, 1, 1) else target_stage
127
127
 
128
128
  @classmethod
129
- async def create_record(cls, doc, start_stage):
129
+ async def create_record(cls, doc, start_stage, important_level):
130
130
  attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
131
131
  s3_path_info = []
132
132
  add_extends = {}
@@ -151,7 +151,7 @@ class DocumentProcessor:
151
151
  'store_path'],
152
152
  'file_name': att['file_name']
153
153
  })
154
- result_dict = {'id': doc['_id'], 's3_path_info': s3_path_info}
154
+ result_dict = {'id': doc['_id'], 's3_path_info': s3_path_info, 'important_level': important_level}
155
155
  if add_extends:
156
156
  result_dict['extends'] = add_extends
157
157
  return True, result_dict
@@ -165,7 +165,7 @@ class DocumentProcessor:
165
165
  }
166
166
 
167
167
  @classmethod
168
- async def process(cls, doc, custom_process_step):
168
+ async def process(cls, doc, custom_process_step, important_level):
169
169
  report_id = doc['_id']
170
170
  # 筛选文件
171
171
  doc = cls.stock_us_filter_by_is_primary(doc)
@@ -195,7 +195,7 @@ class DocumentProcessor:
195
195
  "Invalid process sequence: 'start_stage' occurs before 'target_stage'.",
196
196
  doc['_id'])
197
197
 
198
- file_name_check_status, record = await cls.create_record(doc, start_stage)
198
+ file_name_check_status, record = await cls.create_record(doc, start_stage, important_level)
199
199
  if not file_name_check_status:
200
200
  return cls.create_result_info("step_error", "Document file name too lang.", report_id)
201
201
 
@@ -177,7 +177,13 @@ class FilingOfficialProcessor:
177
177
 
178
178
  async def process_task_entry(self, source: Literal["filing_data", "reports_view", "G7_demo"],
179
179
  query: dict, tags: list[str], priority: str,
180
- is_important: bool = False, custom_step: Optional[list[str]] = None):
180
+ is_important: bool = False, custom_step: Optional[list[str]] = None, important_level = None):
181
+
182
+ if not important_level or not isinstance(important_level, int):
183
+ important_level = 0
184
+
185
+ if important_level == 0:
186
+ raise ValueError(f'important_level must be an integer (int) greater than 0. {important_level}')
181
187
 
182
188
  allowed_steps = {"convert", "extract", "embedding"}
183
189
  if custom_step is not None:
@@ -206,7 +212,7 @@ class FilingOfficialProcessor:
206
212
  self.all_stat_count['all'] += 1
207
213
  for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
208
214
  perm_id_set.add(orbit_entity_id)
209
- result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step)
215
+ result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step, important_level=important_level)
210
216
  process_data.append(result_record)
211
217
  if len(process_data) >= self.max_batch_size:
212
218
  file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
@@ -42,7 +42,9 @@ class FlowUpdater:
42
42
  parent_id_store_path_map = {i['parent_id']: i['store_path'] for i in attachments_pdf}
43
43
  x_attachments_pdf = []
44
44
  for item in attachments:
45
- store_path = item['store_path']
45
+ # The video branch will generate store_path_pre, so check for the existence of store_path_pre first.
46
+ # For other branches, the value will be null.
47
+ store_path = item['store_path_pre'] or item['store_path']
46
48
  parent_id = item['id']
47
49
  if store_path not in db_store_path_set:
48
50
  raise ValueError(f"store_path not found in db: {store_path}")
@@ -109,7 +111,6 @@ class FlowUpdater:
109
111
  attachments_pdf = op_meta_record['x_attachments_pdf']
110
112
  data_source = op_meta_record['data_source']
111
113
 
112
- # 校验参数
113
114
  if not report_id or not status or not start_stage or not current_stage or not target_stage or (not attachments and start_stage == 'convert'):
114
115
  raise ValueError(f"Invalid op_meta_record: {op_meta_record}")
115
116
  if status == 'success' and target_stage != current_stage:
@@ -117,7 +118,6 @@ class FlowUpdater:
117
118
  return
118
119
  attachments = [i for i in attachments if i['category'] == 'x_attachments']
119
120
 
120
- # 确定结束阶段
121
121
  end_stage = target_stage if status == 'success' else current_stage if status == 'failed' else None
122
122
  if end_stage is None:
123
123
  logger.info(f"Invalid status: {status}.")
@@ -127,11 +127,9 @@ class FlowUpdater:
127
127
  if start_index > end_index:
128
128
  raise ValueError(f"start_stage cannot be after end_stage: {start_stage} -> {end_stage}.")
129
129
 
130
- # 开始执行回更逻辑
131
130
  logger.info(
132
131
  f"😊 _id: {report_id}-{status}, start_step: {self.step_tuple[start_index]}, end_step: {self.step_tuple[end_index]}")
133
132
 
134
- # 查询这个报告是否存在于当前数据源
135
133
  db_doc = self._check_and_create_collection(data_source).find_one({'_id': report_id},
136
134
  {'_id': 1, 'x_attachments': 1,
137
135
  'x_status_list': 1})
@@ -143,10 +141,8 @@ class FlowUpdater:
143
141
  logger.warning(f"{db_doc['_id']} statxus is not 'crawl_downloaded'")
144
142
  return
145
143
 
146
- # 构建数据库中存在store_path 防止出现数据库中x_attachments与x_attachments_pdf数据不一致问题
147
144
  db_store_path = {f"s3://{i['bucket']}/{i['store_path']}" for i in db_doc['x_attachments']}
148
145
 
149
- # 构建更新参数
150
146
  update_params = {}
151
147
  step_status = True
152
148
  for index, step in enumerate(self.step_tuple[start_index:end_index + 1], 1):
@@ -156,7 +152,6 @@ class FlowUpdater:
156
152
  item = self._step_handle(step, step_status, attachments, db_store_path, attachments_pdf)
157
153
  update_params.update(item)
158
154
 
159
- # 执行更新
160
155
  if update_params:
161
156
  # logger.info(json.dumps(update_params, ensure_ascii=False, indent=2))
162
157
  self.update_mongo_data(report_id, data_source, update_params, kafka_ignore)
@@ -137,7 +137,8 @@ class FileFlowHandleV2:
137
137
  'created_at': now,
138
138
  'updated_at': now,
139
139
  'tags': params['tags'],
140
- 'tag': params['tag']
140
+ 'tag': params['tag'],
141
+ 'important_level': record.get('important_level', 0)
141
142
  }
142
143
 
143
144
  step = {
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: orbitkit
3
- Version: 0.8.44
3
+ Version: 0.8.46
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.44
File without changes
File without changes
File without changes
File without changes
File without changes