orbitkit 0.8.38__tar.gz → 0.8.39__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {orbitkit-0.8.38/orbitkit.egg-info → orbitkit-0.8.39}/PKG-INFO +1 -1
  2. orbitkit-0.8.39/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/airflow_handler/data_preprocessing.py +15 -2
  4. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/airflow_handler/file_flow_entry_process.py +2 -2
  5. {orbitkit-0.8.38 → orbitkit-0.8.39/orbitkit.egg-info}/PKG-INFO +1 -1
  6. orbitkit-0.8.38/orbitkit/VERSION +0 -1
  7. {orbitkit-0.8.38 → orbitkit-0.8.39}/LICENSE +0 -0
  8. {orbitkit-0.8.38 → orbitkit-0.8.39}/MANIFEST.in +0 -0
  9. {orbitkit-0.8.38 → orbitkit-0.8.39}/README.md +0 -0
  10. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/__init__.py +0 -0
  11. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/airflow_handler/__init__.py +0 -0
  12. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  13. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/airflow_handler/file_handler.py +0 -0
  14. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  15. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/audio_transcoder/__init__.py +0 -0
  16. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  17. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/constant/__init__.py +0 -0
  18. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/constant/report_schema.py +0 -0
  19. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/id_srv/__init__.py +0 -0
  20. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/id_srv/id_gen.py +0 -0
  21. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/id_srv/id_perm_like.py +0 -0
  22. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/lark_send/__init__.py +0 -0
  23. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/lark_send/lark.py +0 -0
  24. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/llm_tools/__init__.py +0 -0
  25. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  26. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/orbit_type/__init__.py +0 -0
  27. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  28. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  29. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/orbit_type/tools.py +0 -0
  30. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_embedding/__init__.py +0 -0
  31. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  32. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  33. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/__init__.py +0 -0
  34. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  35. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/exceptions.py +0 -0
  36. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  37. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  38. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  39. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  40. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  41. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  42. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  43. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  44. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  45. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  46. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/base.py +0 -0
  47. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  48. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/core.py +0 -0
  49. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  50. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  51. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  52. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_writer/__init__.py +0 -0
  53. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  54. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/__init__.py +0 -0
  55. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/cache_asset_downloader.py +0 -0
  56. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/common.py +0 -0
  57. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/customize_regix_manager.py +0 -0
  58. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/secret_manager.py +0 -0
  59. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_aliyun.py +0 -0
  60. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  61. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_aws.py +0 -0
  62. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  63. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_date.py +0 -0
  64. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_html.py +0 -0
  65. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_kafka.py +0 -0
  66. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_md5.py +0 -0
  67. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_selenium.py +0 -0
  68. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_simple_timer.py +0 -0
  69. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_str.py +0 -0
  70. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_type_mapping.py +0 -0
  71. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit/util/util_url.py +0 -0
  72. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit.egg-info/SOURCES.txt +0 -0
  73. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit.egg-info/dependency_links.txt +0 -0
  74. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit.egg-info/not-zip-safe +0 -0
  75. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit.egg-info/requires.txt +0 -0
  76. {orbitkit-0.8.38 → orbitkit-0.8.39}/orbitkit.egg-info/top_level.txt +0 -0
  77. {orbitkit-0.8.38 → orbitkit-0.8.39}/setup.cfg +0 -0
  78. {orbitkit-0.8.38 → orbitkit-0.8.39}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: orbitkit
3
- Version: 0.8.38
3
+ Version: 0.8.39
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.39
@@ -102,6 +102,15 @@ class DocumentProcessor:
102
102
  report_type_ids = doc.get('x_orbit_data', {}).get('report_type_id_list', [])
103
103
  return "extract" if report_type_ids == ['19999'] else target_stage
104
104
 
105
+ @staticmethod
106
+ def update_target_stage_for_reported_at(doc, target_stage):
107
+ date_str = doc.get('x_reported_at_utc_date', '1970-01-01')
108
+ try:
109
+ reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
110
+ except ValueError:
111
+ reported_date = datetime.datetime(1970, 1, 1)
112
+ return "extract" if reported_date < datetime.datetime(2020, 1, 1) else target_stage
113
+
105
114
  @classmethod
106
115
  def create_record(cls, doc, start_stage):
107
116
  attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
@@ -141,7 +150,7 @@ class DocumentProcessor:
141
150
  }
142
151
 
143
152
  @classmethod
144
- def process(cls, doc):
153
+ def process(cls, doc, check_doc):
145
154
  report_id = doc['_id']
146
155
  # 筛选文件
147
156
  doc = cls.stock_us_filter_by_is_primary(doc)
@@ -156,7 +165,11 @@ class DocumentProcessor:
156
165
  return cls.create_result_info("xbrl", "XBRL format cannot be processed.", xbrl_data)
157
166
 
158
167
  start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc)
159
- target_stage = cls.update_target_stage_for_report_type(doc, target_stage)
168
+
169
+ # 特殊条件下的数据不做embedding ('19999'类型和报告日期小于2020-01-01)
170
+ if target_stage == 'embedding' and check_doc:
171
+ target_stage = cls.update_target_stage_for_report_type(doc, target_stage)
172
+ target_stage = cls.update_target_stage_for_reported_at(doc, target_stage)
160
173
 
161
174
  if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
162
175
  return cls.create_result_info("step_error",
@@ -180,7 +180,7 @@ class FilingOfficialProcessor:
180
180
 
181
181
  def process_task_entry(self, source: Literal["filing_data", "filing_reports_astock_test0822", "reports_view"],
182
182
  query: dict, tags: list[str], priority: str,
183
- is_important: bool = False):
183
+ is_important: bool = False, check_doc: bool = True):
184
184
 
185
185
  if source == 'reports_view':
186
186
  collections = self.source_map[source]
@@ -199,7 +199,7 @@ class FilingOfficialProcessor:
199
199
  self.all_stat_count['all'] += 1
200
200
  for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
201
201
  perm_id_set.add(orbit_entity_id)
202
- process_data.append(self.data_processor.process(doc))
202
+ process_data.append(self.data_processor.process(doc, check_doc))
203
203
  if len(process_data) >= self.max_batch_size:
204
204
  file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
205
205
  process_data)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: orbitkit
3
- Version: 0.8.38
3
+ Version: 0.8.39
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.38
File without changes
File without changes
File without changes
File without changes
File without changes