orbitkit 0.8.41__tar.gz → 0.8.43__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (78)
  1. {orbitkit-0.8.41/orbitkit.egg-info → orbitkit-0.8.43}/PKG-INFO +1 -1
  2. orbitkit-0.8.43/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/data_preprocessing.py +27 -17
  4. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/file_flow_entry_process.py +24 -15
  5. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/file_flow_exit_process.py +11 -7
  6. {orbitkit-0.8.41 → orbitkit-0.8.43/orbitkit.egg-info}/PKG-INFO +1 -1
  7. orbitkit-0.8.41/orbitkit/VERSION +0 -1
  8. {orbitkit-0.8.41 → orbitkit-0.8.43}/LICENSE +0 -0
  9. {orbitkit-0.8.41 → orbitkit-0.8.43}/MANIFEST.in +0 -0
  10. {orbitkit-0.8.41 → orbitkit-0.8.43}/README.md +0 -0
  11. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/__init__.py +0 -0
  12. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/__init__.py +0 -0
  13. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/file_handler.py +0 -0
  14. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  15. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/audio_transcoder/__init__.py +0 -0
  16. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  17. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/constant/__init__.py +0 -0
  18. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/constant/report_schema.py +0 -0
  19. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/id_srv/__init__.py +0 -0
  20. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/id_srv/id_gen.py +0 -0
  21. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/id_srv/id_perm_like.py +0 -0
  22. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/lark_send/__init__.py +0 -0
  23. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/lark_send/lark.py +0 -0
  24. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/llm_tools/__init__.py +0 -0
  25. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  26. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/orbit_type/__init__.py +0 -0
  27. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  28. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  29. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/orbit_type/tools.py +0 -0
  30. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_embedding/__init__.py +0 -0
  31. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  32. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  33. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/__init__.py +0 -0
  34. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  35. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/exceptions.py +0 -0
  36. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  37. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  38. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  39. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  40. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  41. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  42. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  43. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  44. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  45. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  46. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/base.py +0 -0
  47. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  48. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/core.py +0 -0
  49. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  50. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  51. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  52. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_writer/__init__.py +0 -0
  53. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  54. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/__init__.py +0 -0
  55. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/cache_asset_downloader.py +0 -0
  56. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/common.py +0 -0
  57. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/customize_regix_manager.py +0 -0
  58. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/secret_manager.py +0 -0
  59. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_aliyun.py +0 -0
  60. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  61. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_aws.py +0 -0
  62. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  63. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_date.py +0 -0
  64. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_html.py +0 -0
  65. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_kafka.py +0 -0
  66. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_md5.py +0 -0
  67. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_selenium.py +0 -0
  68. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_simple_timer.py +0 -0
  69. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_str.py +0 -0
  70. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_type_mapping.py +0 -0
  71. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit/util/util_url.py +0 -0
  72. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit.egg-info/SOURCES.txt +0 -0
  73. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit.egg-info/dependency_links.txt +0 -0
  74. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit.egg-info/not-zip-safe +0 -0
  75. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit.egg-info/requires.txt +0 -0
  76. {orbitkit-0.8.41 → orbitkit-0.8.43}/orbitkit.egg-info/top_level.txt +0 -0
  77. {orbitkit-0.8.41 → orbitkit-0.8.43}/setup.cfg +0 -0
  78. {orbitkit-0.8.41 → orbitkit-0.8.43}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: orbitkit
3
- Version: 0.8.41
3
+ Version: 0.8.43
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.43
@@ -1,11 +1,13 @@
1
1
  import os
2
2
  import datetime
3
3
  from collections import defaultdict
4
- from googletrans import Translator
4
+ from importlib.metadata import version
5
+ import googletrans
5
6
 
6
7
 
7
8
  class DocumentProcessor:
8
-
9
+ if version("googletrans") < "4.0.2":
10
+ raise ImportError(f"googletrans >= 4.0.2 is required for async support. {version('googletrans')}")
9
11
  AUDIO_SUFFIXES = [".mp3", ".wav", ".aac", ".wma", ".m4a"]
10
12
  VIDEO_SUFFIXES = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".3gp", ".hevc"]
11
13
  PDF_SUFFIXES = [".pdf"]
@@ -20,9 +22,10 @@ class DocumentProcessor:
20
22
  return f".{file_path.split('.')[-1]}".lower()
21
23
 
22
24
  @staticmethod
23
- def translate_text(text, dest='en'):
24
- translator = Translator()
25
- result = translator.translate(text, dest=dest)
25
+ async def translate_text(text, dest='en'):
26
+ """异步翻译函数 https://pypi.org/project/googletrans/"""
27
+ translator = googletrans.Translator()
28
+ result = await translator.translate(text, dest=dest)
26
29
  return result.text
27
30
 
28
31
  @staticmethod
@@ -88,12 +91,15 @@ class DocumentProcessor:
88
91
  return False, None
89
92
 
90
93
  @staticmethod
91
- def get_start_stage_target_stage(doc):
94
+ def get_start_stage_target_stage(doc, custom_process_step_list):
92
95
  status_info = doc.get('x_status_list', {}).get('status_convert', {})
93
96
  status = status_info.get('status')
94
97
  status_txt = status_info.get('status_txt')
95
98
  x_spider_name = doc['x_spider_name']
96
99
 
100
+ if custom_process_step_list:
101
+ return custom_process_step_list[0], custom_process_step_list[1], x_spider_name
102
+
97
103
  if status != 'convert_done':
98
104
  return 'convert', 'embedding', x_spider_name
99
105
 
@@ -106,12 +112,12 @@ class DocumentProcessor:
106
112
  return 'success', 'success', x_spider_name
107
113
 
108
114
  @staticmethod
109
- def update_target_stage_for_report_type(doc, target_stage):
115
+ def update_target_stage_by_report_type(doc, target_stage):
110
116
  report_type_ids = doc.get('x_orbit_data', {}).get('report_type_id_list', [])
111
117
  return "extract" if report_type_ids == ['19999'] else target_stage
112
118
 
113
119
  @staticmethod
114
- def update_target_stage_for_reported_at(doc, target_stage):
120
+ def update_target_stage_by_reported_at(doc, target_stage):
115
121
  date_str = doc.get('x_reported_at_utc_date', '1970-01-01')
116
122
  try:
117
123
  reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
@@ -120,7 +126,7 @@ class DocumentProcessor:
120
126
  return "extract" if reported_date < datetime.datetime(2020, 1, 1) else target_stage
121
127
 
122
128
  @classmethod
123
- def create_record(cls, doc, start_stage):
129
+ async def create_record(cls, doc, start_stage):
124
130
  attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
125
131
  s3_path_info = []
126
132
  add_extends = {}
@@ -133,7 +139,7 @@ class DocumentProcessor:
133
139
  if ext in cls.AUDIO_SUFFIXES or ext in cls.VIDEO_SUFFIXES:
134
140
  add_extends = {
135
141
  "original_title": doc['x_orbit_data']['report_title'],
136
- "title": cls.translate_text(doc['x_orbit_data']['report_title']),
142
+ "title": await cls.translate_text(text=doc['x_orbit_data']['report_title']),
137
143
  "published": doc['x_reported_at_utc_date'],
138
144
  "tickers": [],
139
145
  "perm_id_list": doc['x_orbit_data']['perm_id_list'],
@@ -159,7 +165,7 @@ class DocumentProcessor:
159
165
  }
160
166
 
161
167
  @classmethod
162
- def process(cls, doc, check_doc):
168
+ async def process(cls, doc, custom_process_step):
163
169
  report_id = doc['_id']
164
170
  # 筛选文件
165
171
  doc = cls.stock_us_filter_by_is_primary(doc)
@@ -173,19 +179,23 @@ class DocumentProcessor:
173
179
  if is_xbrl:
174
180
  return cls.create_result_info("xbrl", "XBRL format cannot be processed.", xbrl_data)
175
181
 
176
- start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc)
182
+ start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc, custom_process_step)
177
183
 
178
- # 特殊条件下的数据不做embedding ('19999'类型和报告日期小于2020-01-01)
179
- if target_stage == 'embedding' and check_doc:
180
- target_stage = cls.update_target_stage_for_report_type(doc, target_stage)
181
- target_stage = cls.update_target_stage_for_reported_at(doc, target_stage)
184
+ # 判断 特殊条件下的数据不做embedding ('19999'类型和报告日期小于2020-01-01)
185
+ if target_stage == 'embedding' and not custom_process_step:
186
+ target_stage = cls.update_target_stage_by_report_type(doc, target_stage)
187
+ target_stage = cls.update_target_stage_by_reported_at(doc, target_stage)
188
+ # 特殊情况下只需要做提取但是这个数据被过滤不需要做embedding
189
+ if start_stage == 'embedding' and target_stage == 'extract':
190
+ start_stage = 'success'
191
+ target_stage = 'success'
182
192
 
183
193
  if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
184
194
  return cls.create_result_info("step_error",
185
195
  "Invalid process sequence: 'start_stage' occurs before 'target_stage'.",
186
196
  doc['_id'])
187
197
 
188
- file_name_check_status, record = cls.create_record(doc, start_stage)
198
+ file_name_check_status, record = await cls.create_record(doc, start_stage)
189
199
  if not file_name_check_status:
190
200
  return cls.create_result_info("step_error", "Document file name too lang.", report_id)
191
201
 
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  from collections import Counter
3
3
  from datetime import datetime
4
- from typing import Literal
4
+ from typing import Literal, Optional
5
5
  import logging
6
6
  import pymongo
7
7
  import pytz
@@ -30,18 +30,15 @@ class FilingOfficialProcessor:
30
30
  self.mongo_client = pymongo.MongoClient(mongo_uri)
31
31
  self.data_xbrl_convert_collection = self.mongo_client['filing_reports']['data_xbrl_convert']
32
32
  self.filing_data_collection = self.mongo_client['filing_reports']['filing_data']
33
- self.filing_reports_astock_test0822_collection = self.mongo_client['filing_reports']['filing_reports_astock_test0822']
34
33
  self.annotation_reports_view_rows_collection = self.mongo_client['filing_reports'][
35
34
  'annotation_reports_view_rows']
36
35
  self.source_map = {
37
36
  'filing_data': (self.filing_data_collection, 'filing_data'),
38
- 'filing_reports_astock_test0822': (self.filing_reports_astock_test0822_collection, 'filing_reports_astock_test0822'),
37
+ 'G7_demo': (self.filing_data_collection, 'G7_demo'),
39
38
  'reports_view': [
40
- (self.filing_data_collection, 'filing_data'),
41
- (self.filing_reports_astock_test0822_collection, 'filing_reports_astock_test0822')
39
+ (self.filing_data_collection, 'filing_data')
42
40
  ]
43
41
  }
44
- self.only_low_important_set = {'internal_seekingalpha'}
45
42
  postgres_uri = os.environ.get('PG_URI_AIRFLOW12_USER_NEWSFEEDSITE') if not postgres_uri else postgres_uri
46
43
  if not postgres_uri:
47
44
  raise KeyError('postgres_uri not set.')
@@ -122,7 +119,7 @@ class FilingOfficialProcessor:
122
119
  "x_status_list.status_convert.status_meta": "meta_init",
123
120
  "x_updated_date": datetime.now(tz=pytz.timezone('UTC')).strftime("%Y-%m-%dT%H:%M:%S%z"),
124
121
  }})
125
- logger.info(f'The document file type cannot be converted.')
122
+ logger.info(f'Unable to convert {len(report_id_list)} document(s) due to unsupported file type.')
126
123
 
127
124
  def update_extends_fields(self, perm_id_list, file_flow_info):
128
125
  stmt = select(self.pi2_table.c.orbit_entity_id, self.pi2_table.c.ticker).where(self.pi2_table.c.orbit_entity_id.in_(perm_id_list))
@@ -138,10 +135,10 @@ class FilingOfficialProcessor:
138
135
  for step_info, records in file_flow_info.items():
139
136
  for record in records:
140
137
  if 'extends' in record and record.get('extends') is not None:
141
- tickers = []
138
+ tickers = set()
142
139
  for i in record['extends']['perm_id_list']:
143
- tickers.extend(orbit_entity_id_ticker_map.get(i, []))
144
- record['extends']['tickers'] = tickers
140
+ tickers.update(orbit_entity_id_ticker_map.get(i, []))
141
+ record['extends']['tickers'] = list(tickers)
145
142
 
146
143
  record['extends']['report_type_id_list_str'] = [self.report_type_id_name_map.get(i) for i in record['extends']['report_type_id_list_str']]
147
144
 
@@ -160,7 +157,7 @@ class FilingOfficialProcessor:
160
157
  f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: False, message: 'File has already completed the embedding stage.' ")
161
158
  continue
162
159
 
163
- if is_important and x_spider_name not in self.only_low_important_set:
160
+ if is_important:
164
161
  logger.info(f"is_important: {is_important} - {x_spider_name}")
165
162
  status, ids, message = self.file_handler.entry_point_urgent(records=records, start_stage=start_stage,
166
163
  target_stage=target_stage,
@@ -178,9 +175,19 @@ class FilingOfficialProcessor:
178
175
  logger.info(f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: {status}, message: {message}")
179
176
 
180
177
 
181
- def process_task_entry(self, source: Literal["filing_data", "filing_reports_astock_test0822", "reports_view"],
178
+ async def process_task_entry(self, source: Literal["filing_data", "reports_view", "G7_demo"],
182
179
  query: dict, tags: list[str], priority: str,
183
- is_important: bool = False, check_doc: bool = True):
180
+ is_important: bool = False, custom_step: Optional[list[str]] = None):
181
+
182
+ allowed_steps = {"convert", "extract", "embedding"}
183
+ if custom_step is not None:
184
+ if not isinstance(custom_step, list):
185
+ raise ValueError("custom_step must be a list or None.")
186
+ if len(custom_step) > 2:
187
+ raise ValueError("custom_step can contain at most two elements.")
188
+ for step in custom_step:
189
+ if step not in allowed_steps:
190
+ raise ValueError(f"Invalid step '{step}'. Allowed steps are: {allowed_steps}")
184
191
 
185
192
  if source == 'reports_view':
186
193
  collections = self.source_map[source]
@@ -199,7 +206,8 @@ class FilingOfficialProcessor:
199
206
  self.all_stat_count['all'] += 1
200
207
  for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
201
208
  perm_id_set.add(orbit_entity_id)
202
- process_data.append(self.data_processor.process(doc, check_doc))
209
+ result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step)
210
+ process_data.append(result_record)
203
211
  if len(process_data) >= self.max_batch_size:
204
212
  file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
205
213
  process_data)
@@ -226,4 +234,5 @@ class FilingOfficialProcessor:
226
234
  process_data.clear()
227
235
  perm_id_set.clear()
228
236
 
229
- logger.info(f"finish processing {self.all_stat_count}.")
237
+ logger.info(f"finish processing {self.all_stat_count}. \n")
238
+ self.all_stat_count = {'all': 0, 'skip': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
@@ -27,9 +27,9 @@ class FlowUpdater:
27
27
  setattr(self, collection_name, self.coon[data_source])
28
28
  return getattr(self, collection_name)
29
29
 
30
- def _handle_convert(self, status, attachments, db_store_path_set):
31
- if not attachments:
32
- raise ValueError("No attachments provided.")
30
+ def _handle_convert(self, status, attachments, db_store_path_set, attachments_pdf):
31
+ if not attachments or not attachments_pdf:
32
+ raise ValueError("Missing attachments: neither 'attachments' nor 'attachments_pdf' was provided.")
33
33
  if not status:
34
34
  return {
35
35
  'x_status_list.status_convert.status': 'convert_failed',
@@ -38,15 +38,18 @@ class FlowUpdater:
38
38
  }
39
39
 
40
40
  store_path_set = set()
41
+ parent_id_store_path_map = {i['parent_id']: i['store_path'] for i in attachments_pdf}
41
42
  x_attachments_pdf = []
42
43
  for item in attachments:
43
44
  store_path = item['store_path']
45
+ parent_id = item['id']
44
46
  if store_path not in db_store_path_set:
45
47
  raise ValueError(f"store_path not found in db: {store_path}")
46
48
  if store_path in store_path_set:
47
49
  continue
48
50
  store_path_set.add(store_path)
49
- new_store_path = store_path if store_path.lower().endswith('.pdf') else store_path + '.pdf'
51
+ # new_store_path = store_path if store_path.lower().endswith('.pdf') else store_path + '.pdf'
52
+ new_store_path = parent_id_store_path_map[parent_id]
50
53
  x_attachments_pdf.append({
51
54
  "store_path": new_store_path,
52
55
  "store_path_txt": "",
@@ -76,11 +79,11 @@ class FlowUpdater:
76
79
  return {}
77
80
  return {'x_status_list.status_convert.status_txt': 'convert_txt_embedding'}
78
81
 
79
- def _step_handle(self, step_stage, status, attachments, db_store_path):
82
+ def _step_handle(self, step_stage, status, attachments, db_store_path, attachments_pdf):
80
83
  method_name = f"_handle_{step_stage}"
81
84
  method = getattr(self, method_name, None)
82
85
  if method:
83
- return method(status, attachments=attachments,
86
+ return method(status, attachments=attachments, attachments_pdf=attachments_pdf,
84
87
  db_store_path_set=db_store_path) if step_stage == 'convert' else method(status)
85
88
  else:
86
89
  raise ValueError(f"Unknown step_stage: {step_stage}")
@@ -102,6 +105,7 @@ class FlowUpdater:
102
105
  current_stage = op_meta_record['current_stage']
103
106
  target_stage = op_meta_record['target_stage']
104
107
  attachments = op_meta_record['x_attachments']
108
+ attachments_pdf = op_meta_record['x_attachments_pdf']
105
109
  data_source = op_meta_record['data_source']
106
110
 
107
111
  # 校验参数
@@ -148,7 +152,7 @@ class FlowUpdater:
148
152
  if step == end_stage and status == 'failed':
149
153
  step_status = False
150
154
  logger.info(f' Processing step-{index} {step} - {"successfully" if step_status else "failed"}.')
151
- item = self._step_handle(step, step_status, attachments, db_store_path)
155
+ item = self._step_handle(step, step_status, attachments, db_store_path, attachments_pdf)
152
156
  update_params.update(item)
153
157
 
154
158
  # 执行更新
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: orbitkit
3
- Version: 0.8.41
3
+ Version: 0.8.43
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.41
File without changes
File without changes
File without changes
File without changes
File without changes