orbitkit 0.8.48__tar.gz → 0.8.49__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {orbitkit-0.8.48/orbitkit.egg-info → orbitkit-0.8.49}/PKG-INFO +2 -15
- orbitkit-0.8.49/orbitkit/VERSION +1 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/data_preprocessing.py +8 -7
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_flow_entry_process.py +78 -54
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_handler_v2.py +11 -6
- {orbitkit-0.8.48 → orbitkit-0.8.49/orbitkit.egg-info}/PKG-INFO +2 -15
- orbitkit-0.8.48/orbitkit/VERSION +0 -1
- {orbitkit-0.8.48 → orbitkit-0.8.49}/LICENSE +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/MANIFEST.in +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/README.md +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_flow_exit_process.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_handler.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/audio_transcoder/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/audio_transcoder/netmind_extract_v1.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/constant/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/constant/report_schema.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/id_srv/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/id_srv/id_gen.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/id_srv/id_perm_like.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/lark_send/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/lark_send/lark.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/llm_tools/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/llm_tools/quick_rag_chat.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/orbit_type/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/orbit_type/doc_4_compile_rule.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/orbit_type/orbit_type_simple.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/orbit_type/tools.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_embedding/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_embedding/pdf_txt_embedding.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_embedding/pdf_txt_embedding_v2.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/a_stock_extractor_v1.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/exceptions.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_block_extractor_base.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_block_extractor_v1.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_block_extractor_v2.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_azure.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_minerU_v1.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_netmind_v1.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_netmind_v2.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_netmind_v3.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor/pdf_extractor_orbit.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/base.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/cloud_provider.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/core.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/exceptions.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/extractors.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_extractor_simple/utils.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_writer/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/pdf_writer/pdf_writer_simple.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/__init__.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/cache_asset_downloader.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/common.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/customize_regix_manager.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/secret_manager.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_aliyun.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_aliyun_oss_simple.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_aws.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_aws_s3_wrapper.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_date.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_html.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_kafka.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_md5.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_selenium.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_simple_timer.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_str.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_type_mapping.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/util/util_url.py +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit.egg-info/SOURCES.txt +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit.egg-info/dependency_links.txt +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit.egg-info/not-zip-safe +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit.egg-info/requires.txt +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit.egg-info/top_level.txt +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/setup.cfg +0 -0
- {orbitkit-0.8.48 → orbitkit-0.8.49}/setup.py +0 -0
{orbitkit-0.8.48/orbitkit.egg-info → orbitkit-0.8.49}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: orbitkit
-Version: 0.8.48
+Version: 0.8.49
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao
@@ -37,19 +37,6 @@ Requires-Dist: prettytable>=3.17.0
 Requires-Dist: pytz>=2025.2
 Requires-Dist: Deprecated
 Requires-Dist: func_timeout
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
-Dynamic: license-file
-Dynamic: maintainer
-Dynamic: maintainer-email
-Dynamic: platform
-Dynamic: requires-dist
-Dynamic: summary
 
 # orbitkit
 
```
orbitkit-0.8.49/orbitkit/VERSION

```diff
@@ -0,0 +1 @@
+0.8.49
```
{orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/data_preprocessing.py

```diff
@@ -12,7 +12,7 @@ class DocumentProcessor:
     VIDEO_SUFFIXES = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".3gp", ".hevc"]
     PDF_SUFFIXES = [".pdf"]
     DOC_SUFFIXES = [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"]
-    TXT_SUFFIXES = [".txt", ".html", ".htm"]
+    TXT_SUFFIXES = [".txt", ".html", ".htm", ".xhtml"]
     ALL_ALLOWED_SUFFIXES = set(AUDIO_SUFFIXES + VIDEO_SUFFIXES + PDF_SUFFIXES + DOC_SUFFIXES + TXT_SUFFIXES)
 
     DATA_PROCESS_STEPS = ['convert', 'extract', 'embedding', 'success']
```
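The practical effect of this hunk: `.xhtml` attachments now pass the shared suffix allow-list as text files. A minimal sketch of that check, assuming a helper `is_allowed` and an abbreviated `AUDIO_SUFFIXES` (neither appears in this hunk):

```python
import os

# Abbreviated stand-in: the real AUDIO_SUFFIXES list is not shown in this hunk.
AUDIO_SUFFIXES = [".mp3", ".wav"]
VIDEO_SUFFIXES = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".3gp", ".hevc"]
PDF_SUFFIXES = [".pdf"]
DOC_SUFFIXES = [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"]
TXT_SUFFIXES = [".txt", ".html", ".htm", ".xhtml"]  # ".xhtml" is new in 0.8.49
ALL_ALLOWED_SUFFIXES = set(AUDIO_SUFFIXES + VIDEO_SUFFIXES + PDF_SUFFIXES + DOC_SUFFIXES + TXT_SUFFIXES)

def is_allowed(store_path: str) -> bool:
    # Hypothetical helper: true when the file suffix is in the allow-list.
    return os.path.splitext(store_path.lower())[1] in ALL_ALLOWED_SUFFIXES

assert is_allowed("annual_report.xhtml")    # accepted from 0.8.49 on
assert not is_allowed("annual_report.zip")  # still rejected
```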
```diff
@@ -74,9 +74,10 @@ class DocumentProcessor:
     def xbrl_type_check(cls, doc):
         is_xbrl = doc.get('x_info_data', {}).get('is_xbrl') == 'true'
         x_attachments = doc.get('x_attachments', [])
+        convert_status = doc.get('x_status_list', {}).get('status_convert', {}).get('status')
        xhtml_count = sum(1 for att in x_attachments if att['store_path'].lower().endswith('.xhtml'))
 
-        if is_xbrl or xhtml_count > 0:
+        if is_xbrl or xhtml_count > 0 and convert_status != 'convert_done':
             template = cls.create_xbrl_template()
             template['_id'] = doc['_id']
             template['source_type'] = doc.get('x_report_source', {}).get('source_type', '')
```
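One reading note on the new condition: in Python, `and` binds tighter than `or`, so the guard parses as `is_xbrl or (xhtml_count > 0 and convert_status != 'convert_done')`. The `convert_status` check therefore only gates the xhtml branch; an `is_xbrl` document is routed to the XBRL template regardless. A self-contained check of that grouping:

```python
def routes_to_xbrl_template(is_xbrl: bool, xhtml_count: int, convert_status: str) -> bool:
    # Same expression as the hunk; the grouping comes from operator precedence.
    return is_xbrl or xhtml_count > 0 and convert_status != 'convert_done'

assert routes_to_xbrl_template(True, 0, 'convert_done')       # is_xbrl always wins
assert routes_to_xbrl_template(False, 1, 'pending')           # unconverted xhtml attachment
assert not routes_to_xbrl_template(False, 1, 'convert_done')  # converted xhtml now passes through
```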
```diff
@@ -123,7 +124,7 @@ class DocumentProcessor:
             reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
         except ValueError:
             reported_date = datetime.datetime(1970, 1, 1)
-        return "extract" if reported_date < datetime.datetime(
+        return "extract" if reported_date < datetime.datetime(2023, 1, 1) else target_stage
 
     @classmethod
     async def create_record(cls, doc, start_stage, important_level):
```
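The removed line's cutoff date is truncated in this view; the new line pins it at 2023-01-01. A runnable sketch of the resulting behavior, wrapping the `try/except` from the hunk in a hypothetical `cap_target_stage` helper:

```python
import datetime

def cap_target_stage(date_str: str, target_stage: str) -> str:
    try:
        reported_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    except ValueError:
        reported_date = datetime.datetime(1970, 1, 1)  # unparseable dates count as old
    return "extract" if reported_date < datetime.datetime(2023, 1, 1) else target_stage

assert cap_target_stage("2022-06-30", "embedding") == "extract"    # pre-cutoff: capped
assert cap_target_stage("2024-01-15", "embedding") == "embedding"  # post-cutoff: unchanged
assert cap_target_stage("not-a-date", "embedding") == "extract"    # falls back to 1970
```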
```diff
@@ -177,7 +178,7 @@ class DocumentProcessor:
 
         is_xbrl, xbrl_data = cls.xbrl_type_check(doc)
         if is_xbrl:
-            return cls.create_result_info("xbrl", "XBRL format cannot be processed.", xbrl_data)
+            return cls.create_result_info("xbrl", "XBRL or Xhtml format cannot be processed.", xbrl_data)
 
         start_stage, target_stage, x_spider_name = cls.get_start_stage_target_stage(doc, custom_process_step)
 
```
```diff
@@ -185,7 +186,7 @@ class DocumentProcessor:
         if target_stage == 'embedding' and not custom_process_step:
             target_stage = cls.update_target_stage_by_report_type(doc, target_stage)
             target_stage = cls.update_target_stage_by_reported_at(doc, target_stage)
-        #
+        # Special case: only embedding is needed, but the conditions above cap this data at extraction, leaving its status abnormal
         if start_stage == 'embedding' and target_stage == 'extract':
             start_stage = 'success'
             target_stage = 'success'
```
```diff
@@ -193,11 +194,11 @@ class DocumentProcessor:
         if cls.DATA_PROCESS_STEPS.index(target_stage) < cls.DATA_PROCESS_STEPS.index(start_stage):
             return cls.create_result_info("step_error",
                                           "Invalid process sequence: 'start_stage' occurs before 'target_stage'.",
-
+                                          report_id)
 
         file_name_check_status, record = await cls.create_record(doc, start_stage, important_level)
         if not file_name_check_status:
-            return cls.create_result_info("
+            return cls.create_result_info("error", "Document file name too lang.", report_id)
 
         return cls.create_result_info("file_flow", "Success", [start_stage, target_stage, x_spider_name, record])
 
```
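For readers tracing the `step_error` branch: stage ordering is defined purely by position in `DATA_PROCESS_STEPS`, so the check reduces to two `list.index` lookups. A minimal sketch:

```python
DATA_PROCESS_STEPS = ['convert', 'extract', 'embedding', 'success']

def is_valid_sequence(start_stage: str, target_stage: str) -> bool:
    # A target that sorts before the start is the "step_error" case above.
    return DATA_PROCESS_STEPS.index(target_stage) >= DATA_PROCESS_STEPS.index(start_stage)

assert is_valid_sequence('convert', 'embedding')
assert not is_valid_sequence('embedding', 'extract')  # the pair an earlier hunk remaps to 'success'/'success'
```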
{orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_flow_entry_process.py

```diff
@@ -1,7 +1,7 @@
 import os
 from collections import Counter
 from datetime import datetime
-from typing import
+from typing import Optional
 import logging
 import pymongo
 import pytz
```
```diff
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
 
 class FilingOfficialProcessor:
 
-    def __init__(self, mongo_uri=None, postgres_uri=None, aws_access_key_id=None, aws_secret_access_key=None, pi2_postgres_uri=None, pi2_database_name=None):
+    def __init__(self, mongo_uri=None, postgres_uri=None, aws_access_key_id=None, aws_secret_access_key=None, pi2_postgres_uri=None, pi2_database_name=None, databases_fileflow=None):
         mongo_uri = os.environ.get('MONGO_URI_MAIN_USER_APP') if not mongo_uri else mongo_uri
         if not mongo_uri:
             raise KeyError('mongo_uri not set.')
```
```diff
@@ -29,23 +29,15 @@ class FilingOfficialProcessor:
 
         self.mongo_client = pymongo.MongoClient(mongo_uri)
         self.data_xbrl_convert_collection = self.mongo_client['filing_reports']['data_xbrl_convert']
-
-        self.annotation_reports_view_rows_collection = self.mongo_client['filing_reports'][
-            'annotation_reports_view_rows']
-        self.source_map = {
-            'filing_data': (self.filing_data_collection, 'filing_data'),
-            'G7_demo': (self.filing_data_collection, 'G7_demo'),
-            'reports_view': [
-                (self.filing_data_collection, 'filing_data')
-            ]
-        }
+
         postgres_uri = os.environ.get('PG_URI_AIRFLOW12_USER_NEWSFEEDSITE') if not postgres_uri else postgres_uri
         if not postgres_uri:
             raise KeyError('postgres_uri not set.')
-
+        databases_fileflow = databases_fileflow or "process_net"
+        self.file_handler = FileFlowHandleV2(postgres_uri=postgres_uri, database_name=databases_fileflow)
         self.data_processor = DocumentProcessor()
         self.max_batch_size = 10000
-        self.all_stat_count = {'all': 0, 'skip': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
+        self.all_stat_count = {'all': 0, 'skip': 0, 'doc_error': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
 
         self.s3_client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
         self.matcher = OrbitTypeMatcher(self.s3_client)
```
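A hypothetical construction showing the new `databases_fileflow` argument, which defaults to `"process_net"` and is forwarded to `FileFlowHandleV2` as `database_name`. The URIs and keys below are placeholders, and the import path is assumed from the file's location in the package:

```python
from orbitkit.airflow_handler.file_flow_entry_process import FilingOfficialProcessor

processor = FilingOfficialProcessor(
    mongo_uri="mongodb://user:pass@mongo.example.com:27017",    # placeholder
    postgres_uri="postgresql://user:pass@pg.example.com:5432",  # placeholder
    aws_access_key_id="<key-id>",
    aws_secret_access_key="<secret>",
    databases_fileflow="process_net",  # new in 0.8.49; also the fallback default
)
```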
```diff
@@ -65,9 +57,19 @@ class FilingOfficialProcessor:
             autoload_with=self.postgres_engine, schema='security_master'
         )
 
+        self.postgres_engine2 = create_engine(f"{postgres_uri}/{databases_fileflow}",
+                                              connect_args={"sslmode": "require"})
+        self.postgres_session2 = sessionmaker(bind=self.postgres_engine2)
+        self.Session2 = scoped_session(self.postgres_session2)
+
+        self.op_meta = Table(
+            'op_meta', self.postgres_metadata,
+            autoload_with=self.postgres_engine2, schema='public'
+        )
+
     @contextmanager
-    def session_scope(self):
-        session = self.Session()
+    def session_scope(self, use_session=None):
+        session = self.Session() if not use_session else use_session
         try:
             yield session
             session.commit()
```
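A self-contained sketch of the pattern `session_scope` now supports: drawing from the default scoped session unless the caller passes another one, as `op_meat_deduplicate_docs` later does with `self.Session2`. The error-handling middle of the context manager is not shown in this hunk, so the `except` branch below is an illustrative assumption; note the `finally` still removes only the default registry, mirroring the diff:

```python
from contextlib import contextmanager
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker, scoped_session

engine = create_engine("sqlite:///:memory:")          # placeholder engine
Session = scoped_session(sessionmaker(bind=engine))   # stands in for self.Session
Session2 = scoped_session(sessionmaker(bind=engine))  # stands in for self.Session2

@contextmanager
def session_scope(use_session=None):
    session = Session() if not use_session else use_session
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()  # assumed: this branch is outside the hunk
        raise
    finally:
        Session.remove()  # as in the diff: always removes the default registry

with session_scope(use_session=Session2) as session:
    session.execute(text("SELECT 1"))  # runs on the second session
```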
```diff
@@ -77,7 +79,7 @@ class FilingOfficialProcessor:
         finally:
             self.Session.remove()
 
-    def create_spider_name_source_type_map(self,
+    def create_spider_name_source_type_map(self, collection, label):
 
         def find_duplicates(keys):
             return [k for k, v in Counter(keys).items() if v > 1]
```
```diff
@@ -85,9 +87,8 @@ class FilingOfficialProcessor:
         map_dict = {}
         pipeline = [{'$group': {'_id': "$x_spider_name"}}]
 
-        for
-
-            map_dict[document['_id']] = label
+        for document in collection.aggregate(pipeline):
+            map_dict[document['_id']] = label
 
         all_keys = list(map_dict.keys())
         duplicates = find_duplicates(all_keys)
```
```diff
@@ -174,10 +175,39 @@ class FilingOfficialProcessor:
         self.all_stat_count['file_flow'] += len(records)
         logger.info(f"{len(records)}--{start_stage}-{target_stage}-{x_spider_name} status: {status}, message: {message}")
 
+    def op_meat_deduplicate_docs(self, docs, buffer_size=1000):
+        buffer = []
+
+        for doc in docs:
+            buffer.append(doc)
+
+            if len(buffer) >= buffer_size:
+                doc_ids = [d['_id'] for d in buffer]
+                with self.session_scope(use_session=self.Session2) as session:
+                    existing_ids = session.query(self.op_meta.c.id).filter(self.op_meta.c.id.in_(doc_ids)).all()
+                    existing_ids = {i[0] for i in existing_ids}
+                    for buffered_doc in buffer:
+                        self.all_stat_count['all'] += 1
+                        if buffered_doc['_id'] not in existing_ids:
+                            yield buffered_doc
+
+                buffer.clear()
+
+        if buffer:
+            doc_ids = [d['_id'] for d in buffer]
+            with self.session_scope(use_session=self.Session2) as session:
+                existing_ids = session.query(self.op_meta.c.id).filter(self.op_meta.c.id.in_(doc_ids)).all()
+                existing_ids = {i[0] for i in existing_ids}
+                for buffered_doc in buffer:
+                    self.all_stat_count['all'] += 1
+                    if buffered_doc['_id'] not in existing_ids:
+                        yield buffered_doc
+
+            buffer.clear()
 
-    async def process_task_entry(self, source:
+    async def process_task_entry(self, source: str,
                                  query: dict, tags: list[str], priority: str,
-                                 is_important: bool = False, custom_step: Optional[list[str]] = None, important_level = None):
+                                 is_important: bool = False, custom_step: Optional[list[str]] = None, important_level = None, db_name: str = None):
 
         if not important_level or not isinstance(important_level, int):
             important_level = 0
```
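The shape of `op_meat_deduplicate_docs` above is a buffered existence-check generator: collect up to `buffer_size` docs, look up their ids against the already-processed `op_meta` table in one query, and yield only unseen docs. A standalone sketch of the same pattern, where `fetch_existing_ids` stands in for the SQL lookup:

```python
def deduplicate_docs(docs, fetch_existing_ids, buffer_size=1000):
    """Yield docs whose _id is not reported as existing, checking in batches."""
    buffer = []

    def flush(buf):
        # One batched lookup per buffer, then filter in memory.
        existing = set(fetch_existing_ids([d['_id'] for d in buf]))
        for doc in buf:
            if doc['_id'] not in existing:
                yield doc

    for doc in docs:
        buffer.append(doc)
        if len(buffer) >= buffer_size:
            yield from flush(buffer)
            buffer.clear()

    if buffer:  # tail batch
        yield from flush(buffer)

# Example: ids 2 and 3 are already processed, so only 1 and 4 come through.
docs = [{'_id': i} for i in range(1, 5)]
already_processed = {2, 3}
survivors = list(deduplicate_docs(docs, lambda ids: [i for i in ids if i in already_processed], buffer_size=2))
assert [d['_id'] for d in survivors] == [1, 4]
```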
```diff
@@ -195,43 +225,24 @@ class FilingOfficialProcessor:
             if step not in allowed_steps:
                 raise ValueError(f"Invalid step '{step}'. Allowed steps are: {allowed_steps}")
 
-        if
-
-        else:
-            collections = [self.source_map[source]]
-
-        spider_name_source_type = self.create_spider_name_source_type_map(collections)
+        collection = self.mongo_client[db_name if db_name else "filing_reports"][source]
+        spider_name_source_type = self.create_spider_name_source_type_map(collection, source)
 
         process_data = []
         perm_id_set = set()
-
-
-
-
-        for
-
-
-
-
-            process_data.append(result_record)
-            if len(process_data) >= self.max_batch_size:
-                file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
-                    process_data)
-                file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
-                self.all_stat_count['skip'] += len(doc_error_list)
-                self.all_stat_count['step_error'] += len(except_id_list)
-                self.all_stat_count['xbrl'] += len(xbrl_data)
-                self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
-                self.send_xbrl_data_to_mongo(xbrl_data)
-                self.update_doc_status_to_convert(collection, doc_error_list)
-                process_data.clear()
-                perm_id_set.clear()
-
-        if process_data:
+        logger.info(f"load {source} data.")
+        docs = collection.find(query).batch_size(1000)
+        duplicate_docs = self.op_meat_deduplicate_docs(docs, buffer_size=self.max_batch_size) if not is_important else docs
+        for doc in duplicate_docs:
+            for orbit_entity_id in doc['x_orbit_data']['perm_id_list']:
+                perm_id_set.add(orbit_entity_id)
+            result_record = await self.data_processor.process(doc=doc, custom_process_step=custom_step, important_level=important_level)
+            process_data.append(result_record)
+            if len(process_data) >= self.max_batch_size:
                 file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
                     process_data)
                 file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
-                self.all_stat_count['skip'] += len(doc_error_list)
+                self.all_stat_count['doc_error'] += len(doc_error_list)
                 self.all_stat_count['step_error'] += len(except_id_list)
                 self.all_stat_count['xbrl'] += len(xbrl_data)
                 self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
```
```diff
@@ -240,5 +251,18 @@ class FilingOfficialProcessor:
                 process_data.clear()
                 perm_id_set.clear()
 
+        if process_data:
+            file_flow_info, xbrl_data, except_id_list, doc_error_list = self.data_processor.split_data_by_spider_name_and_step(
+                process_data)
+            file_flow_info = self.update_extends_fields(list(perm_id_set), file_flow_info)
+            self.all_stat_count['doc_error'] += len(doc_error_list)
+            self.all_stat_count['step_error'] += len(except_id_list)
+            self.all_stat_count['xbrl'] += len(xbrl_data)
+            self.send_task(file_flow_info, tags, is_important, priority, spider_name_source_type)
+            self.send_xbrl_data_to_mongo(xbrl_data)
+            self.update_doc_status_to_convert(collection, doc_error_list)
+            process_data.clear()
+            perm_id_set.clear()
+
         logger.info(f"finish processing {self.all_stat_count}. \n")
-        self.all_stat_count = {'all': 0, 'skip': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
+        self.all_stat_count = {'all': 0, 'skip': 0, 'doc_error': 0, 'step_error': 0, 'xbrl': 0, 'file_flow': 0}
```
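Reusing the `processor` from the constructor sketch above, a hypothetical invocation of the reworked entry point; the query, tags, and priority values are placeholders, not documented options:

```python
import asyncio

async def main():
    await processor.process_task_entry(
        source="filing_data",      # now used directly as the MongoDB collection name
        query={},                  # placeholder filter passed to collection.find()
        tags=["daily"],
        priority="normal",
        is_important=False,        # False routes docs through op_meat_deduplicate_docs
        custom_step=None,
        important_level=0,
        db_name="filing_reports",  # new in 0.8.49; also the fallback when omitted
    )

asyncio.run(main())
```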
{orbitkit-0.8.48 → orbitkit-0.8.49}/orbitkit/airflow_handler/file_handler_v2.py

```diff
@@ -44,7 +44,7 @@ class FileFlowHandleV2:
             autoload_with=self.postgres_engine, schema='public'
         )
 
-        self.not_allow_file_type_list = not_allow_file_type_list or [
+        self.not_allow_file_type_list = not_allow_file_type_list or []
 
     @contextmanager
     def session_scope(self):
```
```diff
@@ -94,20 +94,23 @@ class FileFlowHandleV2:
         result = session.execute(stmt)
         return [row[0] for row in result.fetchall()]
 
-    def _check_records(self, records: List[Dict[str, Any]], clean_exist_data: bool) -> Tuple[bool, List[str], str]:
+    def _check_records(self, records: List[Dict[str, Any]], clean_exist_data: bool) -> Tuple[bool, List[str], str, List[str]]:
         ids = []
+        invalidate_ids = []
         record_count = 0
         for record in records:
             record_count += 1
             is_valid, msg = self._validate_record(record)
             if not is_valid:
-
+                invalidate_ids.append(record.get('id', 'unknown'))
+                logger.error(f"Validation failed for record {record.get('id', 'unknown')}: {msg}")
+                continue
             ids.append(record["id"])
 
         existing_ids = self._get_existing_ids(ids)
         if not clean_exist_data and len(existing_ids) == record_count:
-            return False, existing_ids, "No new data has been inserted."
-        return True, existing_ids, f"Validation complete. total: {len(records)}. {len(existing_ids)} records already exist."
+            return False, existing_ids, "No new data has been inserted.", invalidate_ids
+        return True, existing_ids, f"Validation complete. total: {len(records)}. {len(existing_ids)} records already exist.", invalidate_ids
 
     def _build_insert_data(
         self, record: Dict[str, Any], params: Dict[str, Any]
```
```diff
@@ -324,7 +327,7 @@ class FileFlowHandleV2:
         elif not isinstance(records, list):
             raise ValueError("records must be a dict or list of dicts.")
 
-        is_valid, existing_ids, msg = self._check_records(records, clean_exist_data)
+        is_valid, existing_ids, msg, invalidate_ids = self._check_records(records, clean_exist_data)
         if not is_valid:
             return False, existing_ids, msg
         logger.info(msg)
```
```diff
@@ -334,6 +337,8 @@ class FileFlowHandleV2:
         exist_data_ids = set()
         count = 0
         for record in records:
+            if record['id'] in invalidate_ids:
+                continue
             count += 1
             if record['id'] in existing_ids:
                 if clean_exist_data:
```
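Taken together, the last three hunks change the failure mode: a record that fails validation no longer stops the batch; its id is collected by `_check_records` and the insert loop skips it. A toy sketch of that flow, with a stand-in validator (the real `_validate_record` rules are not shown in this diff):

```python
def check_records(records):
    # Stand-in for _check_records: here a record is valid iff it has a store_path.
    ids, invalidate_ids = [], []
    for record in records:
        if "store_path" not in record:
            invalidate_ids.append(record.get("id", "unknown"))
            continue
        ids.append(record["id"])
    return ids, invalidate_ids

records = [
    {"id": "a", "store_path": "s3://bucket/a.pdf"},
    {"id": "b"},                                     # fails validation
    {"id": "c", "store_path": "s3://bucket/c.pdf"},
]
ids, invalid = check_records(records)
# The insert loop now skips invalid ids instead of failing the whole batch.
to_insert = [r for r in records if r["id"] not in invalid]
assert [r["id"] for r in to_insert] == ["a", "c"]
```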
{orbitkit-0.8.48 → orbitkit-0.8.49/orbitkit.egg-info}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: orbitkit
-Version: 0.8.48
+Version: 0.8.49
 Summary: This project is only for Orbit Tech internal use.
 Home-page: https://github.com/clown-0726/orbitkit
 Author: Lilu Cao
@@ -37,19 +37,6 @@ Requires-Dist: prettytable>=3.17.0
 Requires-Dist: pytz>=2025.2
 Requires-Dist: Deprecated
 Requires-Dist: func_timeout
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license
-Dynamic: license-file
-Dynamic: maintainer
-Dynamic: maintainer-email
-Dynamic: platform
-Dynamic: requires-dist
-Dynamic: summary
 
 # orbitkit
 
```
orbitkit-0.8.48/orbitkit/VERSION DELETED

```diff
@@ -1 +0,0 @@
-0.8.48
```
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|