orbitkit 0.8.48.tar.gz → 0.8.50.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {orbitkit-0.8.48/orbitkit.egg-info → orbitkit-0.8.50}/PKG-INFO +2 -2
  2. orbitkit-0.8.50/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/airflow_handler/data_preprocessing.py +8 -7
  4. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/airflow_handler/file_flow_entry_process.py +78 -54
  5. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/airflow_handler/file_handler_v2.py +11 -6
  6. {orbitkit-0.8.48 → orbitkit-0.8.50/orbitkit.egg-info}/PKG-INFO +2 -2
  7. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit.egg-info/requires.txt +1 -1
  8. {orbitkit-0.8.48 → orbitkit-0.8.50}/setup.py +1 -1
  9. orbitkit-0.8.48/orbitkit/VERSION +0 -1
  10. {orbitkit-0.8.48 → orbitkit-0.8.50}/LICENSE +0 -0
  11. {orbitkit-0.8.48 → orbitkit-0.8.50}/MANIFEST.in +0 -0
  12. {orbitkit-0.8.48 → orbitkit-0.8.50}/README.md +0 -0
  13. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/__init__.py +0 -0
  14. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/airflow_handler/__init__.py +0 -0
  15. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  16. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/airflow_handler/file_handler.py +0 -0
  17. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/audio_transcoder/__init__.py +0 -0
  18. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  19. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/constant/__init__.py +0 -0
  20. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/constant/report_schema.py +0 -0
  21. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/id_srv/__init__.py +0 -0
  22. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/id_srv/id_gen.py +0 -0
  23. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/id_srv/id_perm_like.py +0 -0
  24. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/lark_send/__init__.py +0 -0
  25. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/lark_send/lark.py +0 -0
  26. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/llm_tools/__init__.py +0 -0
  27. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  28. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/orbit_type/__init__.py +0 -0
  29. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  30. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  31. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/orbit_type/tools.py +0 -0
  32. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_embedding/__init__.py +0 -0
  33. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  34. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  35. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/__init__.py +0 -0
  36. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  37. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/exceptions.py +0 -0
  38. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  39. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  40. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  41. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  42. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  43. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  44. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  45. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  46. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  47. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  48. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor_simple/base.py +0 -0
  49. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  50. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor_simple/core.py +0 -0
  51. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  52. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  53. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  54. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_writer/__init__.py +0 -0
  55. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  56. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/__init__.py +0 -0
  57. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/cache_asset_downloader.py +0 -0
  58. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/common.py +0 -0
  59. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/customize_regix_manager.py +0 -0
  60. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/secret_manager.py +0 -0
  61. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_aliyun.py +0 -0
  62. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  63. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_aws.py +0 -0
  64. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  65. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_date.py +0 -0
  66. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_html.py +0 -0
  67. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_kafka.py +0 -0
  68. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_md5.py +0 -0
  69. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_selenium.py +0 -0
  70. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_simple_timer.py +0 -0
  71. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_str.py +0 -0
  72. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_type_mapping.py +0 -0
  73. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/util/util_url.py +0 -0
  74. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit.egg-info/SOURCES.txt +0 -0
  75. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit.egg-info/dependency_links.txt +0 -0
  76. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit.egg-info/not-zip-safe +0 -0
  77. {orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit.egg-info/top_level.txt +0 -0
  78. {orbitkit-0.8.48 → orbitkit-0.8.50}/setup.cfg +0 -0
{orbitkit-0.8.48/orbitkit.egg-info → orbitkit-0.8.50}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: orbitkit
-Version: 0.8.48
+Version: 0.8.50
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao
@@ -33,7 +33,7 @@ Requires-Dist: boto3>=1.40.46
 Requires-Dist: aioboto3>=15.5.0
 Requires-Dist: aiofiles>=25.1.0
 Requires-Dist: requests>=2.32.5
-Requires-Dist: prettytable>=3.17.0
+Requires-Dist: prettytable>=3.16.0
 Requires-Dist: pytz>=2025.2
 Requires-Dist: Deprecated
 Requires-Dist: func_timeout
orbitkit-0.8.50/orbitkit/VERSION (new file)

@@ -0,0 +1 @@
+0.8.50
{orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/airflow_handler/data_preprocessing.py

@@ -12,7 +12,7 @@ class DocumentProcessor:
     VIDEO_SUFFIXES = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".3gp", ".hevc"]
     PDF_SUFFIXES = [".pdf"]
     DOC_SUFFIXES = [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"]
-    TXT_SUFFIXES = [".txt", ".html", ".htm"]
+    TXT_SUFFIXES = [".txt", ".html", ".htm", ".xhtml"]
     ALL_ALLOWED_SUFFIXES = set(AUDIO_SUFFIXES + VIDEO_SUFFIXES + PDF_SUFFIXES + DOC_SUFFIXES + TXT_SUFFIXES)

     DATA_PROCESS_STEPS = ['convert', 'extract', 'embedding', 'success']
@@ -74,9 +74,10 @@ class DocumentProcessor:
     def xbrl_type_check(cls, doc):
         is_xbrl = doc.get('x_info_data', {}).get('is_xbrl') == 'true'
         x_attachments = doc.get('x_attachments', [])
+        convert_status = doc.get('x_status_list', {}).get('status_convert', {}).get('status')
         xhtml_count = sum(1 for att in x_attachments if att['store_path'].lower().endswith('.xhtml'))

-        if is_xbrl or xhtml_count > 0:
+        if is_xbrl or xhtml_count > 0 and convert_status != 'convert_done':
             template = cls.create_xbrl_template()
             template['_id'] = doc['_id']
             template['source_type'] = doc.get('x_report_source', {}).get('source_type', '')
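One reading note on the new guard, since the diff adds convert_status without parentheses: Python's `and` binds tighter than `or`. The standalone sketch below (routes_to_xbrl is a hypothetical helper, not part of orbitkit) shows how the condition groups.

    # Minimal sketch, assuming standard Python precedence: the guard parses as
    # is_xbrl or (xhtml_count > 0 and convert_status != 'convert_done').
    def routes_to_xbrl(is_xbrl, xhtml_count, convert_status):
        return is_xbrl or xhtml_count > 0 and convert_status != 'convert_done'

    assert routes_to_xbrl(True, 0, 'convert_done') is True    # XBRL is always routed
    assert routes_to_xbrl(False, 2, 'convert_done') is False  # converted xhtml passes through
    assert routes_to_xbrl(False, 2, 'pending') is True        # unconverted xhtml is routed

So an is_xbrl document always goes to the XBRL template, while xhtml attachments are diverted only until conversion finishes.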
@@ -123,7 +124,7 @@ class DocumentProcessor:
             reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
         except ValueError:
             reported_date = datetime.datetime(1970, 1, 1)
-        return "extract" if reported_date < datetime.datetime(2020, 1, 1) else target_stage
+        return "extract" if reported_date < datetime.datetime(2023, 1, 1) else target_stage

     @classmethod
     async def create_record(cls, doc, start_stage, important_level):
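The cutoff for capping older filings at the extract stage moves from 2020-01-01 to 2023-01-01, and unparseable dates default to 1970, so they stay capped. A standalone sketch of the behavior (cap_stage is a hypothetical stand-in for DocumentProcessor.update_target_stage_by_reported_at):

    import datetime

    def cap_stage(date_str, target_stage):
        try:
            reported = datetime.datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            reported = datetime.datetime(1970, 1, 1)  # bad dates fall before any cutoff
        return "extract" if reported < datetime.datetime(2023, 1, 1) else target_stage

    assert cap_stage("2022-06-30", "embedding") == "extract"    # newly capped under this change
    assert cap_stage("2023-06-30", "embedding") == "embedding"  # unchanged
    assert cap_stage("not-a-date", "embedding") == "extract"    # unparseable stays capped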
@@ -177,7 +178,7 @@ class DocumentProcessor:

         is_xbrl, xbrl_data = cls.xbrl_type_check(doc)
         if is_xbrl:
-            return cls.create_result_info("xbrl", "XBRL format cannot be processed.", xbrl_data)
+            return cls.create_result_info("xbrl", "XBRL or Xhtml format cannot be processed.", xbrl_data)

         start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc, custom_process_step)
@@ -185,7 +186,7 @@ class DocumentProcessor:
         if target_stage == 'embedding' and not custom_process_step:
             target_stage = cls.update_target_stage_by_report_type(doc, target_stage)
             target_stage = cls.update_target_stage_by_reported_at(doc, target_stage)
-        # Special case: only extraction is needed, but this data was filtered out and does not need embedding
+        # Special case: only embedding is needed, but this data is constrained to stop at extraction, leaving the statuses inconsistent
         if start_stage == 'embedding' and target_stage == 'extract':
             start_stage = 'success'
             target_stage = 'success'
@@ -193,11 +194,11 @@ class DocumentProcessor:
         if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
             return cls.create_result_info("step_error",
                                           "Invalid process sequence: 'start_stage' occurs before 'target_stage'.",
-                                          doc['_id'])
+                                          report_id)

         file_name_check_status, record = await cls.create_record(doc, start_stage, important_level)
         if not file_name_check_status:
-            return cls.create_result_info("step_error", "Document file name too lang.", report_id)
+            return cls.create_result_info("error", "Document file name too lang.", report_id)

         return cls.create_result_info("file_flow", "Success", [start_stage, target_stage, x_spider_name, record])
{orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/airflow_handler/file_flow_entry_process.py

@@ -1,7 +1,7 @@
 import os
 from collections import Counter
 from datetime import datetime
-from typing import Literal, Optional
+from typing import Optional
 import logging
 import pymongo
 import pytz
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)

 class FilingOfficialProcessor:

-    def __init__(self, mongo_uri=None, postgres_uri=None, aws_access_key_id=None, aws_secret_access_key=None, pi2_postgres_uri=None, pi2_database_name=None):
+    def __init__(self, mongo_uri=None, postgres_uri=None, aws_access_key_id=None, aws_secret_access_key=None, pi2_postgres_uri=None, pi2_database_name=None, databases_fileflow=None):
         mongo_uri = os.environ.get('MONGO_URI_MAIN_USER_APP') if not mongo_uri else mongo_uri
         if not mongo_uri:
             raise KeyError('mongo_uri not set.')
@@ -29,23 +29,15 @@ class FilingOfficialProcessor:

         self.mongo_client = pymongo.MongoClient(mongo_uri)
         self.data_xbrl_convert_collection = self.mongo_client['filing_reports']['data_xbrl_convert']
-        self.filing_data_collection = self.mongo_client['filing_reports']['filing_data']
-        self.annotation_reports_view_rows_collection = self.mongo_client['filing_reports'][
-            'annotation_reports_view_rows']
-        self.source_map = {
-            'filing_data': (self.filing_data_collection, 'filing_data'),
-            'G7_demo': (self.filing_data_collection, 'G7_demo'),
-            'reports_view': [
-                (self.filing_data_collection, 'filing_data')
-            ]
-        }
+
         postgres_uri = os.environ.get('PG_URI_AIRFLOW12_USER_NEWSFEEDSITE') if not postgres_uri else postgres_uri
         if not postgres_uri:
             raise KeyError('postgres_uri not set.')
-        self.file_handler = FileFlowHandleV2(postgres_uri=postgres_uri)
+        databases_fileflow = databases_fileflow or "process_net"
+        self.file_handler = FileFlowHandleV2(postgres_uri=postgres_uri, database_name=databases_fileflow)
         self.data_processor = DocumentProcessor()
         self.max_batch_size = 10000
-        self.all_stat_count = {'all': 0, 'skip': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
+        self.all_stat_count = {'all': 0, 'skip': 0, 'doc_error': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}

         self.s3_client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
         self.matcher = OrbitTypeMatcher(self.s3_client)
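Construction changes in one place: the hard-wired source_map of collections is gone, databases_fileflow defaults to "process_net" and is forwarded to FileFlowHandleV2 as database_name, and the stats dict gains a doc_error bucket. A hedged instantiation sketch; every URI and credential below is a placeholder, and the call needs live Mongo, Postgres, and S3 to actually run:

    # Usage sketch only; all values are placeholders, not real endpoints.
    processor = FilingOfficialProcessor(
        mongo_uri="mongodb://user:pass@mongo.example.com:27017",
        postgres_uri="postgresql://user:pass@pg.example.com:5432",
        aws_access_key_id="AKIA_PLACEHOLDER",
        aws_secret_access_key="SECRET_PLACEHOLDER",
        databases_fileflow=None,  # None falls back to "process_net"
    )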
@@ -65,9 +57,19 @@ class FilingOfficialProcessor:
             autoload_with=self.postgres_engine, schema='security_master'
         )

+        self.postgres_engine2 = create_engine(f"{postgres_uri}/{databases_fileflow}",
+                                              connect_args={"sslmode": "require"})
+        self.postgres_session2 = sessionmaker(bind=self.postgres_engine2)
+        self.Session2 = scoped_session(self.postgres_session2)
+
+        self.op_meta = Table(
+            'op_meta', self.postgres_metadata,
+            autoload_with=self.postgres_engine2, schema='public'
+        )
+
     @contextmanager
-    def session_scope(self):
-        session = self.Session()
+    def session_scope(self, use_session=None):
+        session = self.Session() if not use_session else use_session
         try:
             yield session
             session.commit()
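session_scope now accepts an externally supplied session, so one contextmanager serves both the default scoped session and the new Session2 bound to the file-flow database. A standalone sketch of the pattern, with SQLAlchemy replaced by a stub so it runs anywhere; note that in the diff the finally branch still calls self.Session.remove() even when use_session was passed in.

    from contextlib import contextmanager

    class StubSession:
        """Stands in for a SQLAlchemy session in this sketch."""
        def commit(self): print("commit")
        def rollback(self): print("rollback")

    @contextmanager
    def session_scope(factory, use_session=None):
        # Use the caller's session if given, otherwise create a fresh one.
        session = factory() if not use_session else use_session
        try:
            yield session
            session.commit()
        except Exception:
            session.rollback()
            raise

    with session_scope(StubSession) as s:
        pass  # prints "commit"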
@@ -77,7 +79,7 @@ class FilingOfficialProcessor:
         finally:
             self.Session.remove()

-    def create_spider_name_source_type_map(self, collections):
+    def create_spider_name_source_type_map(self, collection, label):

         def find_duplicates(keys):
             return [k for k, v in Counter(keys).items() if v > 1]
@@ -85,9 +87,8 @@ class FilingOfficialProcessor:
         map_dict = {}
         pipeline = [{'$group': {'_id': "$x_spider_name"}}]

-        for collection, label in collections:
-            for document in collection.aggregate(pipeline):
-                map_dict[document['_id']] = label
+        for document in collection.aggregate(pipeline):
+            map_dict[document['_id']] = label

         all_keys = list(map_dict.keys())
         duplicates = find_duplicates(all_keys)
@@ -174,10 +175,39 @@ class FilingOfficialProcessor:
         self.all_stat_count['file_flow'] += len(records)
         logger.info(f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: {status}, message: {message}")

+    def op_meat_deduplicate_docs(self, docs, buffer_size=1000):
+        buffer = []
+
+        for doc in docs:
+            buffer.append(doc)
+
+            if len(buffer) >= buffer_size:
+                doc_ids = [d['_id'] for d in buffer]
+                with self.session_scope(use_session=self.Session2) as session:
+                    existing_ids = session.query(self.op_meta.c.id).filter(self.op_meta.c.id.in_(doc_ids)).all()
+                    existing_ids = {i[0] for i in existing_ids}
+                    for buffered_doc in buffer:
+                        self.all_stat_count['all'] += 1
+                        if buffered_doc['_id'] not in existing_ids:
+                            yield buffered_doc
+
+                buffer.clear()
+
+        if buffer:
+            doc_ids = [d['_id'] for d in buffer]
+            with self.session_scope(use_session=self.Session2) as session:
+                existing_ids = session.query(self.op_meta.c.id).filter(self.op_meta.c.id.in_(doc_ids)).all()
+                existing_ids = {i[0] for i in existing_ids}
+                for buffered_doc in buffer:
+                    self.all_stat_count['all'] += 1
+                    if buffered_doc['_id'] not in existing_ids:
+                        yield buffered_doc
+
+            buffer.clear()

-    async def process_task_entry(self, source: Literal["filing_data", "reports_view", "G7_demo"],
+    async def process_task_entry(self, source: str,
                                  query: dict, tags: list[str], priority: str,
-                                 is_important: bool = False, custom_step: Optional[list[str]] = None, important_level = None):
+                                 is_important: bool = False, custom_step: Optional[list[str]] = None, important_level = None, db_name: str = None):

         if not important_level or not isinstance(important_level, int):
             important_level = 0
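The new op_meat_deduplicate_docs (the name appears to carry a typo of op_meta, kept here as released) buffers incoming docs and drops any whose _id already exists in the Postgres op_meta table before yielding. A distilled, standalone sketch of the buffered-dedup pattern, with the database lookup swapped for an in-memory set (deduplicate and lookup_existing are stand-ins for illustration):

    def deduplicate(docs, lookup_existing, buffer_size=1000):
        buffer = []

        def flush():
            # lookup_existing stands in for the op_meta query in the real code.
            existing = lookup_existing([d['_id'] for d in buffer])
            for d in buffer:
                if d['_id'] not in existing:
                    yield d
            buffer.clear()

        for doc in docs:
            buffer.append(doc)
            if len(buffer) >= buffer_size:
                yield from flush()
        if buffer:
            yield from flush()

    seen = {'a', 'c'}
    docs = [{'_id': x} for x in 'abcd']
    out = list(deduplicate(docs, lambda ids: seen & set(ids), buffer_size=2))
    assert [d['_id'] for d in out] == ['b', 'd']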
@@ -195,43 +225,24 @@ class FilingOfficialProcessor:
             if step not in allowed_steps:
                 raise ValueError(f"Invalid step '{step}'. Allowed steps are: {allowed_steps}")

-        if source == 'reports_view':
-            collections = self.source_map[source]
-        else:
-            collections = [self.source_map[source]]
-
-        spider_name_source_type = self.create_spider_name_source_type_map(collections)
+        collection = self.mongo_client[db_name if db_name else "filing_reports"][source]
+        spider_name_source_type = self.create_spider_name_source_type_map(collection, source)

         process_data = []
         perm_id_set = set()
-        for collection, label in collections:
-            logger.info(f"load {label} data.")
-            docs = collection.find(query).batch_size(1000)
-
-            for doc in docs:
-                self.all_stat_count['all'] += 1
-                for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
-                    perm_id_set.add(orbit_entity_id)
-                result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step, important_level=important_level)
-                process_data.append(result_record)
-                if len(process_data) >= self.max_batch_size:
-                    file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
-                        process_data)
-                    file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
-                    self.all_stat_count['skip'] += len(doc_error_list)
-                    self.all_stat_count['step_error'] += len(except_id_list)
-                    self.all_stat_count['xbrl'] += len(xbrl_data)
-                    self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
-                    self.send_xbrl_data_to_mongo(xbrl_data)
-                    self.update_doc_status_to_convert(collection, doc_error_list)
-                    process_data.clear()
-                    perm_id_set.clear()
-
-        if process_data:
+        logger.info(f"load {source} data.")
+        docs = collection.find(query).batch_size(1000)
+        duplicate_docs = self.op_meat_deduplicate_docs(docs, buffer_size=self.max_batch_size) if not is_important else docs
+        for doc in duplicate_docs:
+            for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
+                perm_id_set.add(orbit_entity_id)
+            result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step, important_level=important_level)
+            process_data.append(result_record)
+            if len(process_data) >= self.max_batch_size:
                 file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
                     process_data)
                 file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
-                self.all_stat_count['skip'] += len(doc_error_list)
+                self.all_stat_count['doc_error'] += len(doc_error_list)
                 self.all_stat_count['step_error'] += len(except_id_list)
                 self.all_stat_count['xbrl'] += len(xbrl_data)
                 self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
@@ -240,5 +251,18 @@ class FilingOfficialProcessor:
                 process_data.clear()
                 perm_id_set.clear()

+        if process_data:
+            file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
+                process_data)
+            file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
+            self.all_stat_count['doc_error'] += len(doc_error_list)
+            self.all_stat_count['step_error'] += len(except_id_list)
+            self.all_stat_count['xbrl'] += len(xbrl_data)
+            self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
+            self.send_xbrl_data_to_mongo(xbrl_data)
+            self.update_doc_status_to_convert(collection, doc_error_list)
+            process_data.clear()
+            perm_id_set.clear()
+
         logger.info(f"finish processing {self.all_stat_count}. \n")
-        self.all_stat_count = {'all': 0, 'skip': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
+        self.all_stat_count = {'all': 0, 'skip': 0, 'doc_error': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
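With the Literal type and source_map gone, process_task_entry resolves any collection name against db_name (defaulting to "filing_reports"), and non-important runs flow through the op_meta dedup generator. A hedged call sketch; all values are placeholders and the call requires live services:

    import asyncio

    async def main():
        processor = FilingOfficialProcessor()  # reads connection URIs from env vars
        await processor.process_task_entry(
            source="filing_data",          # now any Mongo collection name, no longer a Literal
            query={"x_spider_name": "example_spider"},
            tags=["backfill"],
            priority="low",
            is_important=False,            # False routes docs through op_meat_deduplicate_docs
            db_name=None,                  # None falls back to the "filing_reports" database
        )

    # asyncio.run(main())  # commented out: needs live Mongo/Postgres/S3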
{orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit/airflow_handler/file_handler_v2.py

@@ -44,7 +44,7 @@ class FileFlowHandleV2:
             autoload_with=self.postgres_engine, schema='public'
         )

-        self.not_allow_file_type_list = not_allow_file_type_list or ['.xhtml']
+        self.not_allow_file_type_list = not_allow_file_type_list or []

     @contextmanager
     def session_scope(self):
@@ -94,20 +94,23 @@ class FileFlowHandleV2:
             result = session.execute(stmt)
             return [row[0] for row in result.fetchall()]

-    def _check_records(self, records: List[Dict[str, Any]], clean_exist_data: bool) -> Tuple[bool, List[str], str]:
+    def _check_records(self, records: List[Dict[str, Any]], clean_exist_data: bool) -> Tuple[bool, List[str], str, List[str]]:
         ids = []
+        invalidate_ids = []
         record_count = 0
         for record in records:
             record_count += 1
             is_valid, msg = self._validate_record(record)
             if not is_valid:
-                return False, [record.get("id", "unknown")], msg
+                invalidate_ids.append(record.get('id', 'unknown'))
+                logger.error(f"Validation failed for record {record.get('id', 'unknown')}: {msg}")
+                continue
             ids.append(record["id"])

         existing_ids = self._get_existing_ids(ids)
         if not clean_exist_data and len(existing_ids) == record_count:
-            return False, existing_ids, "No new data has been inserted."
-        return True, existing_ids, f"Validation complete. total: {len(records)}. {len(existing_ids)} records already exist."
+            return False, existing_ids, "No new data has been inserted.", invalidate_ids
+        return True, existing_ids, f"Validation complete. total: {len(records)}. {len(existing_ids)} records already exist.", invalidate_ids

     def _build_insert_data(
         self, record: Dict[str, Any], params: Dict[str, Any]
@@ -324,7 +327,7 @@ class FileFlowHandleV2:
         elif not isinstance(records, list):
             raise ValueError("records must be a dict or list of dicts.")

-        is_valid, existing_ids, msg = self._check_records(records, clean_exist_data)
+        is_valid, existing_ids, msg, invalidate_ids = self._check_records(records, clean_exist_data)
         if not is_valid:
             return False, existing_ids, msg
         logger.info(msg)
@@ -334,6 +337,8 @@ class FileFlowHandleV2:
         exist_data_ids = set()
         count = 0
         for record in records:
+            if record['id'] in invalidate_ids:
+                continue
             count += 1
             if record['id'] in existing_ids:
                 if clean_exist_data:
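Taken together, these three hunks change the failure mode: _check_records now returns a four-tuple whose last element lists invalid record ids, and the insert loop skips those ids instead of aborting the whole batch on the first bad record. A condensed, standalone sketch of that flow (check_records and validate are stand-ins for _check_records and _validate_record):

    def check_records(records, validate):
        ids, invalid_ids = [], []
        for record in records:
            ok, msg = validate(record)
            if not ok:
                invalid_ids.append(record.get('id', 'unknown'))
                print(f"skip {record.get('id', 'unknown')}: {msg}")  # real code uses logger.error
                continue
            ids.append(record['id'])
        return ids, invalid_ids

    records = [{'id': 'r1'}, {'id': 'r2', 'bad': True}, {'id': 'r3'}]
    ids, invalid_ids = check_records(records, lambda r: (not r.get('bad'), 'bad flag'))
    inserted = [r for r in records if r['id'] not in invalid_ids]  # mirrors the insert loop's skip
    assert ids == ['r1', 'r3'] and invalid_ids == ['r2']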
{orbitkit-0.8.48 → orbitkit-0.8.50/orbitkit.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: orbitkit
-Version: 0.8.48
+Version: 0.8.50
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao
@@ -33,7 +33,7 @@ Requires-Dist: boto3>=1.40.46
 Requires-Dist: aioboto3>=15.5.0
 Requires-Dist: aiofiles>=25.1.0
 Requires-Dist: requests>=2.32.5
-Requires-Dist: prettytable>=3.17.0
+Requires-Dist: prettytable>=3.16.0
 Requires-Dist: pytz>=2025.2
 Requires-Dist: Deprecated
 Requires-Dist: func_timeout
{orbitkit-0.8.48 → orbitkit-0.8.50}/orbitkit.egg-info/requires.txt

@@ -2,7 +2,7 @@ boto3>=1.40.46
 aioboto3>=15.5.0
 aiofiles>=25.1.0
 requests>=2.32.5
-prettytable>=3.17.0
+prettytable>=3.16.0
 pytz>=2025.2
 Deprecated
 func_timeout
{orbitkit-0.8.48 → orbitkit-0.8.50}/setup.py

@@ -48,7 +48,7 @@ setup(
         "aioboto3 >= 15.5.0",
         "aiofiles >= 25.1.0",
         "requests >= 2.32.5",
-        "prettytable >= 3.17.0",
+        "prettytable >= 3.16.0",
         "pytz >= 2025.2",
         "Deprecated",
         "func_timeout",
orbitkit-0.8.48/orbitkit/VERSION (deleted)

@@ -1 +0,0 @@
-0.8.48