orbitkit 0.8.38__tar.gz → 0.8.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {orbitkit-0.8.38/orbitkit.egg-info → orbitkit-0.8.40}/PKG-INFO +1 -1
  2. orbitkit-0.8.40/orbitkit/VERSION +1 -0
  3. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/airflow_handler/data_preprocessing.py +30 -3
  4. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/airflow_handler/file_flow_entry_process.py +2 -2
  5. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/airflow_handler/file_handler_v2.py +7 -6
  6. {orbitkit-0.8.38 → orbitkit-0.8.40/orbitkit.egg-info}/PKG-INFO +1 -1
  7. orbitkit-0.8.38/orbitkit/VERSION +0 -1
  8. {orbitkit-0.8.38 → orbitkit-0.8.40}/LICENSE +0 -0
  9. {orbitkit-0.8.38 → orbitkit-0.8.40}/MANIFEST.in +0 -0
  10. {orbitkit-0.8.38 → orbitkit-0.8.40}/README.md +0 -0
  11. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/__init__.py +0 -0
  12. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/airflow_handler/__init__.py +0 -0
  13. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
  14. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/airflow_handler/file_handler.py +0 -0
  15. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/audio_transcoder/__init__.py +0 -0
  16. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
  17. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/constant/__init__.py +0 -0
  18. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/constant/report_schema.py +0 -0
  19. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/id_srv/__init__.py +0 -0
  20. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/id_srv/id_gen.py +0 -0
  21. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/id_srv/id_perm_like.py +0 -0
  22. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/lark_send/__init__.py +0 -0
  23. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/lark_send/lark.py +0 -0
  24. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/llm_tools/__init__.py +0 -0
  25. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
  26. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/orbit_type/__init__.py +0 -0
  27. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
  28. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
  29. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/orbit_type/tools.py +0 -0
  30. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_embedding/__init__.py +0 -0
  31. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
  32. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
  33. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/__init__.py +0 -0
  34. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
  35. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/exceptions.py +0 -0
  36. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
  37. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
  38. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
  39. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
  40. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
  41. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
  42. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
  43. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
  44. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
  45. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
  46. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor_simple/base.py +0 -0
  47. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
  48. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor_simple/core.py +0 -0
  49. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
  50. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
  51. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_extractor_simple/utils.py +0 -0
  52. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_writer/__init__.py +0 -0
  53. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
  54. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/__init__.py +0 -0
  55. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/cache_asset_downloader.py +0 -0
  56. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/common.py +0 -0
  57. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/customize_regix_manager.py +0 -0
  58. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/secret_manager.py +0 -0
  59. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_aliyun.py +0 -0
  60. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
  61. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_aws.py +0 -0
  62. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
  63. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_date.py +0 -0
  64. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_html.py +0 -0
  65. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_kafka.py +0 -0
  66. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_md5.py +0 -0
  67. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_selenium.py +0 -0
  68. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_simple_timer.py +0 -0
  69. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_str.py +0 -0
  70. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_type_mapping.py +0 -0
  71. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit/util/util_url.py +0 -0
  72. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit.egg-info/SOURCES.txt +0 -0
  73. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit.egg-info/dependency_links.txt +0 -0
  74. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit.egg-info/not-zip-safe +0 -0
  75. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit.egg-info/requires.txt +0 -0
  76. {orbitkit-0.8.38 → orbitkit-0.8.40}/orbitkit.egg-info/top_level.txt +0 -0
  77. {orbitkit-0.8.38 → orbitkit-0.8.40}/setup.cfg +0 -0
  78. {orbitkit-0.8.38 → orbitkit-0.8.40}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: orbitkit
3
- Version: 0.8.38
3
+ Version: 0.8.40
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -0,0 +1 @@
1
+ 0.8.40
@@ -1,9 +1,13 @@
1
1
  import os
2
2
  import datetime
3
3
  from collections import defaultdict
4
+ import asyncio
5
+ from googletrans import Translator
4
6
 
5
7
 
6
8
  class DocumentProcessor:
9
+ _translator = Translator()
10
+
7
11
  AUDIO_SUFFIXES = [".mp3", ".wav", ".aac", ".wma", ".m4a"]
8
12
  VIDEO_SUFFIXES = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".3gp", ".hevc"]
9
13
  PDF_SUFFIXES = [".pdf"]
@@ -17,6 +21,15 @@ class DocumentProcessor:
17
21
  def get_file_suffix(file_path):
18
22
  return f".{file_path.split('.')[-1]}".lower()
19
23
 
24
+ @classmethod
25
+ async def translate(cls, text, dest='en'):
26
+ result = await cls._translator.translate(text, dest=dest)
27
+ return result.text
28
+
29
+ @classmethod
30
+ def translate_sync(cls, text, dest='en'):
31
+ return asyncio.run(cls.translate(text, dest))
32
+
20
33
  @staticmethod
21
34
  def create_xbrl_template():
22
35
  return {
@@ -102,6 +115,15 @@ class DocumentProcessor:
102
115
  report_type_ids = doc.get('x_orbit_data', {}).get('report_type_id_list', [])
103
116
  return "extract" if report_type_ids == ['19999'] else target_stage
104
117
 
118
+ @staticmethod
119
+ def update_target_stage_for_reported_at(doc, target_stage):
120
+ date_str = doc.get('x_reported_at_utc_date', '1970-01-01')
121
+ try:
122
+ reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
123
+ except ValueError:
124
+ reported_date = datetime.datetime(1970, 1, 1)
125
+ return "extract" if reported_date < datetime.datetime(2020, 1, 1) else target_stage
126
+
105
127
  @classmethod
106
128
  def create_record(cls, doc, start_stage):
107
129
  attachments = doc.get('x_attachments', []) if start_stage == 'convert' else doc.get('x_attachments_pdf', [])
@@ -115,7 +137,8 @@ class DocumentProcessor:
115
137
  _, ext = os.path.splitext(att['store_path'])
116
138
  if ext in cls.AUDIO_SUFFIXES or ext in cls.VIDEO_SUFFIXES:
117
139
  add_extends = {
118
- "title": doc['x_orbit_data']['report_title'],
140
+ "original_title": doc['x_orbit_data']['report_title'],
141
+ "title": cls.translate_sync(doc['x_orbit_data']['report_title']),
119
142
  "published": doc['x_reported_at_utc_date'],
120
143
  "tickers": [],
121
144
  "perm_id_list": doc['x_orbit_data']['perm_id_list'],
@@ -141,7 +164,7 @@ class DocumentProcessor:
141
164
  }
142
165
 
143
166
  @classmethod
144
- def process(cls, doc):
167
+ def process(cls, doc, check_doc):
145
168
  report_id = doc['_id']
146
169
  # 筛选文件
147
170
  doc = cls.stock_us_filter_by_is_primary(doc)
@@ -156,7 +179,11 @@ class DocumentProcessor:
156
179
  return cls.create_result_info("xbrl", "XBRL format cannot be processed.", xbrl_data)
157
180
 
158
181
  start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc)
159
- target_stage = cls.update_target_stage_for_report_type(doc, target_stage)
182
+
183
+ # 特殊条件下的数据不做embedding ('19999'类型和报告日期小于2020-01-01)
184
+ if target_stage == 'embedding' and check_doc:
185
+ target_stage = cls.update_target_stage_for_report_type(doc, target_stage)
186
+ target_stage = cls.update_target_stage_for_reported_at(doc, target_stage)
160
187
 
161
188
  if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
162
189
  return cls.create_result_info("step_error",
@@ -180,7 +180,7 @@ class FilingOfficialProcessor:
180
180
 
181
181
  def process_task_entry(self, source: Literal["filing_data", "filing_reports_astock_test0822", "reports_view"],
182
182
  query: dict, tags: list[str], priority: str,
183
- is_important: bool = False):
183
+ is_important: bool = False, check_doc: bool = True):
184
184
 
185
185
  if source == 'reports_view':
186
186
  collections = self.source_map[source]
@@ -199,7 +199,7 @@ class FilingOfficialProcessor:
199
199
  self.all_stat_count['all'] += 1
200
200
  for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
201
201
  perm_id_set.add(orbit_entity_id)
202
- process_data.append(self.data_processor.process(doc))
202
+ process_data.append(self.data_processor.process(doc, check_doc))
203
203
  if len(process_data) >= self.max_batch_size:
204
204
  file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
205
205
  process_data)
@@ -5,7 +5,8 @@ from contextlib import contextmanager
5
5
  from typing import List, Dict, Tuple, Any, Optional, Union
6
6
  from sqlalchemy.dialects.postgresql import array
7
7
  from sqlalchemy.orm import sessionmaker, scoped_session
8
- from sqlalchemy import create_engine, MetaData, Table, select, Column, Integer, insert, and_, text, delete, update, desc
8
+ from sqlalchemy import create_engine, MetaData, Table, select, Column, Integer, insert, and_, text, delete, update, \
9
+ desc, LABEL_STYLE_TABLENAME_PLUS_COL
9
10
 
10
11
  logger = logging.getLogger(__name__)
11
12
 
@@ -397,9 +398,9 @@ class FileFlowHandleV2:
397
398
  select(op_meta)
398
399
  .where(and_(*conditions)) if conditions else select(op_meta)
399
400
  )
400
- meta_subquery = meta_subquery.order_by(desc(op_meta.c.priority)).limit(limit_size).subquery()
401
+ meta_subquery = meta_subquery.order_by(desc(op_meta.c.priority)).limit(limit_size).subquery(name="public_op_meta")
401
402
  join_stmt = meta_subquery.outerjoin(op_attachment, meta_subquery.c.id == op_attachment.c.meta_id)
402
- stmt = select(meta_subquery, op_attachment).select_from(join_stmt)
403
+ stmt = (select(meta_subquery, op_attachment).select_from(join_stmt).set_label_style(LABEL_STYLE_TABLENAME_PLUS_COL))
403
404
 
404
405
  with self.session_scope() as session:
405
406
  result = session.execute(stmt).fetchall()
@@ -407,10 +408,10 @@ class FileFlowHandleV2:
407
408
  meta_map = {}
408
409
  for row in result:
409
410
  row_dict = dict(row._mapping)
410
- meta_id = row_dict['id']
411
+ meta_id = row_dict['public_op_meta_id']
411
412
 
412
- meta_fields: Dict[str, Any] = {k: v for k, v in row_dict.items() if k in op_meta.c}
413
- attachment_fields: Dict[str, Any] = {k: v for k, v in row_dict.items() if k in op_attachment.c}
413
+ meta_fields: Dict[str, Any] = {k.replace('public_op_meta_', ''): v for k, v in row_dict.items() if k.startswith('public_op_meta_')}
414
+ attachment_fields: Dict[str, Any] = {k.replace('public_op_attachment_', ''): v for k, v in row_dict.items() if k.startswith('public_op_attachment_')}
414
415
 
415
416
  meta_fields['created_at'] = meta_fields['created_at'].isoformat()
416
417
  meta_fields['updated_at'] = meta_fields['updated_at'].isoformat()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: orbitkit
3
- Version: 0.8.38
3
+ Version: 0.8.40
4
4
  Summary: This project is only for Orbit Tech internal use.
5
5
  Home-page: https://github.com/clown-0726/orbitkit
6
6
  Author: Lilu Cao
@@ -1 +0,0 @@
1
- 0.8.38
File without changes
File without changes
File without changes
File without changes
File without changes