orbitkit 0.8.53__tar.gz → 0.8.56__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. {orbitkit-0.8.53/orbitkit.egg-info → orbitkit-0.8.56}/PKG-INFO +1 -1
  2. orbitkit-0.8.56/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/airflow_handler/data_preprocessing.py +10 -2
  4. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/airflow_handler/file_flow_entry_process.py +14 -8
  5. {orbitkit-0.8.53 → orbitkit-0.8.56/orbitkit.egg-info}/PKG-INFO +1 -1
  6. orbitkit-0.8.53/orbitkit/VERSION +0 -1
  7. {orbitkit-0.8.53 → orbitkit-0.8.56}/LICENSE +0 -0
  8. {orbitkit-0.8.53 → orbitkit-0.8.56}/MANIFEST.in +0 -0
  9. {orbitkit-0.8.53 → orbitkit-0.8.56}/README.md +0 -0
  10. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/__init__.py +0 -0
  11. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/airflow_handler/__init__.py +0 -0
  12. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  13. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/airflow_handler/file_handler.py +0 -0
  14. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/airflow_handler/file_handler_v2.py +0 -0
  15. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/audio_transcoder/__init__.py +0 -0
  16. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  17. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/constant/__init__.py +0 -0
  18. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/constant/report_schema.py +0 -0
  19. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/id_srv/__init__.py +0 -0
  20. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/id_srv/id_gen.py +0 -0
  21. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/id_srv/id_perm_like.py +0 -0
  22. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/lark_send/__init__.py +0 -0
  23. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/lark_send/lark.py +0 -0
  24. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/llm_tools/__init__.py +0 -0
  25. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  26. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/orbit_type/__init__.py +0 -0
  27. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  28. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  29. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/orbit_type/tools.py +0 -0
  30. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_embedding/__init__.py +0 -0
  31. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  32. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  33. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/__init__.py +0 -0
  34. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  35. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/exceptions.py +0 -0
  36. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  37. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  38. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  39. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  40. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  41. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  42. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  43. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  44. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  45. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  46. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor_simple/base.py +0 -0
  47. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  48. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor_simple/core.py +0 -0
  49. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  50. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  51. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  52. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_writer/__init__.py +0 -0
  53. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  54. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/__init__.py +0 -0
  55. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/cache_asset_downloader.py +0 -0
  56. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/common.py +0 -0
  57. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/customize_regix_manager.py +0 -0
  58. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/secret_manager.py +0 -0
  59. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/universal_extractor.py +0 -0
  60. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_aliyun.py +0 -0
  61. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  62. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_aws.py +0 -0
  63. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  64. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_date.py +0 -0
  65. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_html.py +0 -0
  66. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_kafka.py +0 -0
  67. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_md5.py +0 -0
  68. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_selenium.py +0 -0
  69. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_simple_timer.py +0 -0
  70. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_str.py +0 -0
  71. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_type_mapping.py +0 -0
  72. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit/util/util_url.py +0 -0
  73. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit.egg-info/SOURCES.txt +0 -0
  74. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit.egg-info/dependency_links.txt +0 -0
  75. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit.egg-info/not-zip-safe +0 -0
  76. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit.egg-info/requires.txt +0 -0
  77. {orbitkit-0.8.53 → orbitkit-0.8.56}/orbitkit.egg-info/top_level.txt +0 -0
  78. {orbitkit-0.8.53 → orbitkit-0.8.56}/setup.cfg +0 -0
  79. {orbitkit-0.8.53 → orbitkit-0.8.56}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.53
3
+ Version: 0.8.56
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.56
@@ -68,7 +68,9 @@ class DocumentProcessor:
68
68
  if not doc:
69
69
  return None
70
70
  suffixes = {cls.get_file_suffix(att['store_path']) for att in doc.get('x_attachments', [])}
71
- return doc if suffixes.issubset(cls.ALL_ALLOWED_SUFFIXES) else None
71
+ if suffixes.issubset(cls.ALL_ALLOWED_SUFFIXES) or doc['x_status_list']['status_convert']['status'] == "convert_done":
72
+ return doc
73
+ return None
72
74
 
73
75
  @classmethod
74
76
  def xbrl_type_check(cls, doc):
@@ -126,6 +128,11 @@ class DocumentProcessor:
126
128
  reported_date = datetime.datetime(1970, 1, 1)
127
129
  return "extract" if reported_date < datetime.datetime(2023, 1, 1) else target_stage
128
130
 
131
+ @staticmethod
132
+ def update_target_stage_by_perm_match(doc, target_stage):
133
+ perm_match_status = doc['x_status_list']['status_perm']['status']
134
+ return target_stage if perm_match_status in {'perm_match_part', 'perm_match'} else "extract"
135
+
129
136
  @classmethod
130
137
  async def create_record(cls, doc, start_stage, important_level):
131
138
  attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
@@ -182,10 +189,11 @@ class DocumentProcessor:
182
189
 
183
190
  start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc, custom_process_step)
184
191
 
185
- # 判断 特殊条件下的数据不做embedding ('19999'类型和报告日期小于2020-01-01)
192
+ # 判断 特殊条件下的数据不做embedding ('19999'类型和报告日期小于2023-01-01)
186
193
  if target_stage == 'embedding' and not custom_process_step:
187
194
  target_stage = cls.update_target_stage_by_report_type(doc, target_stage)
188
195
  target_stage = cls.update_target_stage_by_reported_at(doc, target_stage)
196
+ target_stage = cls.update_target_stage_by_perm_match(doc, target_stage)
189
197
  # 特殊情况下只需要做embedding 但是这个数据被条件限制为只做到提取时状态异常
190
198
  if start_stage == 'embedding' and target_stage == 'extract':
191
199
  start_stage = 'success'
@@ -112,14 +112,20 @@ class FilingOfficialProcessor:
112
112
  def update_doc_status_to_convert(self, collection, report_id_list):
113
113
  if len(report_id_list) == 0:
114
114
  return
115
- collection.update_many({
116
- '_id': {'$in': report_id_list}
117
- }, {'$set': {
118
- "x_status_list.status_convert.status": "convert_failed",
119
- "x_status_list.status_convert.status_txt": "convert_txt_init",
120
- "x_status_list.status_convert.status_meta": "meta_init",
121
- "x_updated_date": datetime.now(tz=pytz.timezone('UTC')).strftime("%Y-%m-%dT%H:%M:%S%z"),
122
- }})
115
+ collection.update_many(
116
+ {"_id": {"$in": report_id_list}},
117
+ {
118
+ "$set": {
119
+ "x_status_list.status_convert.status": "convert_failed",
120
+ "x_status_list.status_convert.status_txt": "convert_txt_init",
121
+ "x_status_list.status_convert.status_meta": "meta_init",
122
+ "x_updated_date": datetime.now(tz=pytz.timezone('UTC')).strftime("%Y-%m-%dT%H:%M:%S%z"),
123
+ },
124
+ "$unset": {
125
+ "x_attachments_pdf": ""
126
+ }
127
+ }
128
+ )
123
129
  logger.info(f'Unable to convert {len(report_id_list)} document(s) due to unsupported file type.')
124
130
 
125
131
  def update_extends_fields(self, perm_id_list, file_flow_info):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orbitkit
3
- Version: 0.8.53
3
+ Version: 0.8.56
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.53
File without changes
File without changes
File without changes
File without changes
File without changes