datamule 1.0.3.tar.gz → 1.0.6.tar.gz
This diff shows the changes between publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
- {datamule-1.0.3 → datamule-1.0.6}/PKG-INFO +1 -4
- {datamule-1.0.3 → datamule-1.0.6}/datamule/__init__.py +2 -13
- {datamule-1.0.3 → datamule-1.0.6}/datamule/document.py +0 -1
- datamule-1.0.6/datamule/helper.py +103 -0
- datamule-1.0.6/datamule/portfolio.py +182 -0
- datamule-1.0.6/datamule/submission.py +38 -0
- {datamule-1.0.3 → datamule-1.0.6}/datamule.egg-info/PKG-INFO +1 -4
- datamule-1.0.6/datamule.egg-info/SOURCES.txt +12 -0
- {datamule-1.0.3 → datamule-1.0.6}/datamule.egg-info/requires.txt +1 -10
- datamule-1.0.6/setup.py +52 -0
- datamule-1.0.3/datamule/book/__init__.py +0 -0
- datamule-1.0.3/datamule/book/book.py +0 -34
- datamule-1.0.3/datamule/book/eftsquery.py +0 -127
- datamule-1.0.3/datamule/book/xbrl_retriever.py +0 -88
- datamule-1.0.3/datamule/data/company_former_names.csv +0 -8148
- datamule-1.0.3/datamule/data/company_metadata.csv +0 -10049
- datamule-1.0.3/datamule/data/company_tickers.csv +0 -9999
- datamule-1.0.3/datamule/data/sec-glossary.csv +0 -728
- datamule-1.0.3/datamule/data/xbrl_descriptions.csv +0 -10024
- datamule-1.0.3/datamule/downloader/downloader.py +0 -374
- datamule-1.0.3/datamule/downloader/premiumdownloader.py +0 -335
- datamule-1.0.3/datamule/helper.py +0 -123
- datamule-1.0.3/datamule/mapping_dicts/txt_mapping_dicts.py +0 -234
- datamule-1.0.3/datamule/mapping_dicts/xml_mapping_dicts.py +0 -19
- datamule-1.0.3/datamule/monitor.py +0 -283
- datamule-1.0.3/datamule/mulebot/__init__.py +0 -1
- datamule-1.0.3/datamule/mulebot/helper.py +0 -35
- datamule-1.0.3/datamule/mulebot/mulebot.py +0 -130
- datamule-1.0.3/datamule/mulebot/mulebot_server/__init__.py +0 -1
- datamule-1.0.3/datamule/mulebot/mulebot_server/server.py +0 -87
- datamule-1.0.3/datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -174
- datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -68
- datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -92
- datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -56
- datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -15
- datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/main.js +0 -57
- datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -27
- datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -47
- datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -129
- datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -28
- datamule-1.0.3/datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -91
- datamule-1.0.3/datamule/mulebot/search.py +0 -52
- datamule-1.0.3/datamule/mulebot/tools.py +0 -82
- datamule-1.0.3/datamule/packageupdater.py +0 -207
- datamule-1.0.3/datamule/portfolio.py +0 -106
- datamule-1.0.3/datamule/submission.py +0 -76
- datamule-1.0.3/datamule.egg-info/SOURCES.txt +0 -45
- datamule-1.0.3/setup.py +0 -47
- {datamule-1.0.3 → datamule-1.0.6}/datamule/config.py +0 -0
- {datamule-1.0.3 → datamule-1.0.6}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-1.0.3 → datamule-1.0.6}/datamule.egg-info/top_level.txt +0 -0
- {datamule-1.0.3 → datamule-1.0.6}/setup.cfg +0 -0
{datamule-1.0.3 → datamule-1.0.6}/PKG-INFO
@@ -1,9 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.0.3
+Version: 1.0.6
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
-Provides-Extra: mulebot
-Provides-Extra: mulebot_server
-Provides-Extra: all
{datamule-1.0.3 → datamule-1.0.6}/datamule/__init__.py
@@ -1,12 +1,7 @@
-from .downloader.downloader import Downloader
-from .downloader.premiumdownloader import PremiumDownloader
-from .monitor import Monitor
-from .packageupdater import PackageUpdater
 from .submission import Submission
 from .portfolio import Portfolio
 from .document import Document
-from …
-from .helper import load_package_csv, load_package_dataset
+from .helper import _load_package_csv, load_package_dataset
 from .config import Config


@@ -32,16 +27,10 @@ def _setup_notebook_env():
 _setup_notebook_env()

 __all__ = [
-    '…
-    'PremiumDownloader',
-    'load_package_csv',
+    '_load_package_csv',
     'load_package_dataset',
-    'Filing',
     'Portfolio',
-    'Monitor',
-    'PackageUpdater',
     'Submission',
     'Document',
-    'parse_sgml_submission',
     'Config'
 ]
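For orientation, a minimal sketch of the trimmed 1.0.6 import surface after the change above (the Downloader, PremiumDownloader, Monitor, and PackageUpdater entry points are no longer exported from the package root; the directory name is illustrative):

# Sketch of the 1.0.6 top-level API; downloading, monitoring, and filtering
# now go through Portfolio rather than the removed classes.
from datamule import Portfolio, Submission, Document, Config

portfolio = Portfolio("filings")  # "filings" is a hypothetical local directory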
datamule-1.0.6/datamule/helper.py
ADDED
@@ -0,0 +1,103 @@
+from functools import lru_cache
+import csv
+from pathlib import Path
+
+def _load_package_csv(name):
+    """Load CSV files from ~/.datamule/ directory"""
+    data_dir = Path.home() / ".datamule"
+    csv_path = data_dir / f"{name}.csv"
+
+    data = []
+
+    with open(csv_path, 'r') as csvfile:
+        csv_reader = csv.DictReader(csvfile)
+        for row in csv_reader:
+            data.append(row)
+
+    return data
+
+def load_package_dataset(dataset):
+    if dataset =='listed_filer_metadata':
+        return _load_package_csv('listed_filer_metadata')
+
+@lru_cache(maxsize=128)
+def get_cik_from_dataset(dataset_name, key, value):
+    dataset = load_package_dataset(dataset_name)
+
+    if dataset_name == 'listed_filer_metadata' and key == 'ticker':
+        key = 'tickers'
+
+    result = []
+    for company in dataset:
+        if key in ['tickers', 'exchanges'] and dataset_name == 'listed_filer_metadata':
+            # Parse the string representation of list into an actual list
+            list_values = [i.strip() for i in company[key][1:-1].replace("'", "").replace('"', '').split(',')]
+            if str(value) in list_values:
+                result.append(company['cik'])
+        elif str(value) == company[key]:
+            result.append(company['cik'])
+
+    return result
+
+
+
+@lru_cache(maxsize=128)
+def get_ciks_from_metadata_filters(**kwargs):
+    """Get CIKs from listed_filer_metadata.csv that match all provided filters."""
+
+    # Start with None to get all CIKs from first filter
+    result_ciks = None
+
+    # For each filter, get matching CIKs and keep intersection
+    for key, value in kwargs.items():
+        # Get CIKs for this filter
+        ciks = get_cik_from_dataset('listed_filer_metadata', key, value)
+        ciks = [int(cik) for cik in ciks]
+
+        # If this is the first filter, set as initial result
+        if result_ciks is None:
+            result_ciks = set(ciks)
+        # Otherwise, take intersection with previous results
+        else:
+            result_ciks &= set(ciks)
+
+        # If no matches left, we can exit early
+        if not result_ciks:
+            return []
+
+    return list(result_ciks)
+
+
+def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
+    """
+    Helper method to process CIK, ticker, and metadata filters.
+    Returns a list of CIKs after processing.
+    """
+    # Input validation
+    if cik is not None and ticker is not None:
+        raise ValueError("Only one of cik or ticker should be provided, not both.")
+
+    # Convert ticker to CIK if provided
+    if ticker is not None:
+        cik = get_cik_from_dataset('listed_filer_metadata', 'ticker', ticker)
+
+    # Normalize CIK format
+    if cik is not None:
+        if isinstance(cik, str):
+            cik = [int(cik)]
+        elif isinstance(cik, int):
+            cik = [cik]
+        elif isinstance(cik, list):
+            cik = [int(x) for x in cik]
+
+    # Process metadata filters if provided
+    if kwargs:
+        metadata_ciks = get_ciks_from_metadata_filters(**kwargs)
+
+        if cik is not None:
+            cik = list(set(cik).intersection(metadata_ciks))
+        else:
+            cik = metadata_ciks
+
+    return cik
+
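A minimal sketch of how these helpers compose (assumes ~/.datamule/listed_filer_metadata.csv exists, which the 1.0.6 setup.py shown later downloads at install time; the ticker, CIK, and exchange values are illustrative):

from datamule.helper import _process_cik_and_metadata_filters

# Ticker -> CIK lookup; cik and ticker are mutually exclusive.
ciks = _process_cik_and_metadata_filters(ticker='AAPL')

# Metadata filters are passed as kwargs and intersected with any explicit CIKs.
# 'exchanges' is one of the list-valued columns the helper special-cases.
ciks = _process_cik_and_metadata_filters(cik=[320193, 789019], exchanges='NASDAQ')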
datamule-1.0.6/datamule/portfolio.py
ADDED
@@ -0,0 +1,182 @@
+from pathlib import Path
+from tqdm import tqdm
+from concurrent.futures import ThreadPoolExecutor
+from .submission import Submission
+from .sec.submissions.downloader import download as sec_download
+from .sec.submissions.textsearch import filter_text
+from .config import Config
+import os
+from .helper import _process_cik_and_metadata_filters
+from .seclibrary.downloader import download as seclibrary_download
+from .sec.xbrl.filter_xbrl import filter_xbrl
+from .sec.submissions.monitor import monitor
+from .sec.xbrl.xbrlmonitor import XBRLMonitor
+
+
+class Portfolio:
+    def __init__(self, path):
+        self.path = Path(path)
+        self.submissions = []
+        self.submissions_loaded = False
+        self.MAX_WORKERS = os.cpu_count() - 1
+
+        if self.path.exists():
+            self._load_submissions()
+            self.submissions_loaded = True
+        else:
+            self.path.mkdir(parents=True, exist_ok=True)
+
+    def _load_submissions(self):
+        folders = [f for f in self.path.iterdir() if f.is_dir()]
+        print(f"Loading {len(folders)} submissions")
+
+        def load_submission(folder):
+            try:
+                return Submission(folder)
+            except Exception as e:
+                print(f"Error loading submission from {folder}: {str(e)}")
+                return None
+
+        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            self.submissions = list(tqdm(
+                executor.map(load_submission, folders),
+                total=len(folders),
+                desc="Loading submissions"
+            ))
+
+        # Filter out None values from failed submissions
+        self.submissions = [s for s in self.submissions if s is not None]
+        print(f"Successfully loaded {len(self.submissions)} submissions")
+
+    def process_submissions(self, callback):
+        """Process all submissions using a thread pool."""
+        if not self.submissions_loaded:
+            self._load_submissions()
+        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            results = list(tqdm(
+                executor.map(callback, self.submissions),
+                total=len(self.submissions),
+                desc="Processing submissions"
+            ))
+            return results
+
+    def process_documents(self, callback):
+        """Process all documents using a thread pool."""
+        if not self.submissions_loaded:
+            self._load_submissions()
+
+        documents = [doc for sub in self.submissions for doc in sub]
+
+        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            results = list(tqdm(
+                executor.map(callback, documents),
+                total=len(documents),
+                desc="Processing documents"
+            ))
+            return results
+
+    def filter_text(self, text_query, cik=None, ticker=None, submission_type=None, filing_date=None, **kwargs):
+        """
+        Filter text based on query and various parameters.
+        When called multiple times, takes the intersection of results.
+        Now supports metadata filters through kwargs.
+        """
+        # Process CIK and metadata filters
+        cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+        # Call the filter_text function with processed parameters
+        new_accession_numbers = filter_text(
+            text_query=text_query,
+            cik=cik,
+            submission_type=submission_type,
+            filing_date=filing_date
+        )
+
+        # If we already have accession numbers, take the intersection
+        if hasattr(self, 'accession_numbers') and self.accession_numbers:
+            self.accession_numbers = list(set(self.accession_numbers).intersection(new_accession_numbers))
+        else:
+            # First query, just set the accession numbers
+            self.accession_numbers = new_accession_numbers
+
+    def filter_xbrl(self, taxonomy, concept, unit, period, logic, value):
+        """
+        Filter XBRL data based on logic and value.
+        """
+        new_accession_numbers = filter_xbrl(
+            taxonomy=taxonomy,
+            concept=concept,
+            unit=unit,
+            period=period,
+            logic=logic,
+            value=value
+        )
+
+        # If we already have accession numbers, take the intersection
+        if hasattr(self, 'accession_numbers') and self.accession_numbers:
+            self.accession_numbers = list(set(self.accession_numbers).intersection(new_accession_numbers))
+        else:
+            # First query, just set the accession numbers
+            self.accession_numbers = new_accession_numbers
+
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, **kwargs):
+        if provider is None:
+            config = Config()
+            provider = config.get_default_source()
+
+        # Process CIK and metadata filters
+        cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+        if provider == 'datamule':
+
+            seclibrary_download(
+                output_dir=self.path,
+                cik=cik,
+                submission_type=submission_type,
+                filing_date=filing_date,
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+            )
+        else:
+            sec_download(
+                output_dir=self.path,
+                cik=cik,
+                submission_type=submission_type,
+                filing_date=filing_date,
+                requests_per_second=5, # Revisit this later.
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+            )
+
+        self.submissions_loaded = False
+    def monitor_submissions(self,data_callback=None, poll_callback=None, submission_type=None, cik=None,
+                            polling_interval=200, requests_per_second=5, quiet=False, start_date=None, ticker=None, **kwargs):
+
+        cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+        monitor(
+            data_callback=data_callback,
+            poll_callback=poll_callback,
+            cik=cik,
+            submission_type=submission_type,
+            polling_interval=polling_interval,
+            requests_per_second=requests_per_second,
+            quiet=quiet,
+            start_date=start_date
+        )
+
+
+
+
+    def __iter__(self):
+        if not self.submissions_loaded:
+            self._load_submissions()
+        return iter(self.submissions)
+
+    def document_type(self, document_types):
+        """Filter documents by type(s)."""
+        if not self.submissions_loaded:
+            self._load_submissions()
+        if isinstance(document_types, str):
+            document_types = [document_types]
+
+        for submission in self.submissions:
+            yield from submission.document_type(document_types)
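A hedged sketch of the workflow these methods are built for, using only what is defined above. The directory name, form type, dates, and query strings are illustrative; the filing_date tuple form is an assumption, and the filter_xbrl values echo the example query in the removed book/book.py comments further down without being verified against the expected formats:

from datamule import Portfolio

portfolio = Portfolio("apple_filings")  # created on first use if it does not exist

# Repeated filters intersect on accession numbers.
portfolio.filter_text("artificial intelligence", ticker="AAPL",
                      submission_type="10-K",
                      filing_date=("2023-01-01", "2023-12-31"))
portfolio.filter_xbrl(taxonomy="us-gaap", concept="ResearchAndDevelopmentExpense",
                      unit="USD", period="2023", logic=">", value=0)

# Download only the filtered submissions; provider falls back to Config's default source.
portfolio.download_submissions(ticker="AAPL", submission_type="10-K",
                               filing_date=("2023-01-01", "2023-12-31"))

# Iterate documents of a given type across the downloaded submissions.
for document in portfolio.document_type("10-K"):
    ...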
datamule-1.0.6/datamule/submission.py
ADDED
@@ -0,0 +1,38 @@
+from pathlib import Path
+import json
+from .document import Document
+
+class Submission:
+    def __init__(self, path):
+        self.path = Path(path)
+        self._load_metadata()
+
+    def _load_metadata(self):
+        metadata_path = self.path / 'metadata.json'
+        with metadata_path.open('r') as f:
+            self.metadata = json.load(f)
+
+    def document_type(self, document_type):
+        # Convert single document type to list for consistent handling
+        if isinstance(document_type, str):
+            document_types = [document_type]
+        else:
+            document_types = document_type
+
+        for doc in self.metadata['documents']:
+            if doc['type'] in document_types:
+                filename = doc.get('filename')
+                if filename is None:
+                    continue
+
+                document_path = self.path / filename
+                yield Document(doc['type'], document_path)
+
+    def __iter__(self):
+        for doc in self.metadata['documents']:
+            filename = doc.get('filename')
+            if filename is None:
+                continue
+
+            document_path = self.path / filename
+            yield Document(doc['type'], document_path)
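For context, a small sketch of the on-disk layout Submission expects, inferred from _load_metadata and the iterators above (the folder name, filename, and metadata values shown are hypothetical examples of that shape):

# Expected layout of a submission folder:
#   0000320193-23-000106/
#       metadata.json      -> {"documents": [{"type": "10-K", "filename": "aapl-20230930.htm"}, ...]}
#       aapl-20230930.htm
from datamule import Submission

sub = Submission("0000320193-23-000106")
for document in sub:                         # every document that has a filename
    print(document)
for document in sub.document_type("10-K"):   # only documents of the requested type(s)
    print(document)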
{datamule-1.0.3 → datamule-1.0.6}/datamule.egg-info/PKG-INFO
@@ -1,9 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.0.3
+Version: 1.0.6
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
-Provides-Extra: mulebot
-Provides-Extra: mulebot_server
-Provides-Extra: all
datamule-1.0.6/datamule.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,12 @@
+setup.py
+datamule/__init__.py
+datamule/config.py
+datamule/document.py
+datamule/helper.py
+datamule/portfolio.py
+datamule/submission.py
+datamule.egg-info/PKG-INFO
+datamule.egg-info/SOURCES.txt
+datamule.egg-info/dependency_links.txt
+datamule.egg-info/requires.txt
+datamule.egg-info/top_level.txt
datamule-1.0.6/setup.py
ADDED
@@ -0,0 +1,52 @@
+from setuptools import setup
+from setuptools import find_namespace_packages
+import os
+import gzip
+import shutil
+import urllib.request
+from pathlib import Path
+
+# Create data directory in user's home
+data_dir = Path.home() / ".datamule"
+data_dir.mkdir(exist_ok=True)
+
+# Download data file
+file_url = "https://github.com/john-friedman/datamule-data/raw/master/data/filer_metadata/listed_filer_metadata.csv.gz"
+file_path = data_dir / "listed_filer_metadata.csv"
+temp_gz_path = data_dir / "listed_filer_metadata.csv.gz"
+
+if not file_path.exists():
+    print(f"Downloading data to {data_dir}")
+    urllib.request.urlretrieve(file_url, temp_gz_path)
+
+    with gzip.open(temp_gz_path, 'rb') as f_in:
+        with open(file_path, 'wb') as f_out:
+            shutil.copyfileobj(f_in, f_out)
+
+    os.remove(temp_gz_path)
+    print(f"Data downloaded to {file_path}")
+
+setup(
+    name="datamule",
+    author="John Friedman",
+    version="1.0.6",
+    description="Making it easier to use SEC filings.",
+    packages=find_namespace_packages(include=['datamule']),
+    url="https://github.com/john-friedman/datamule-python",
+    install_requires=[
+        'aiohttp',
+        'aiolimiter',
+        'tqdm',
+        'requests',
+        'nest_asyncio',
+        'aiofiles',
+        'polars',
+        'setuptools',
+        'selectolax',
+        'pytz',
+        'zstandard',
+        'doc2dict',
+        'secsgml',
+        'lxml'
+    ]
+)
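The install-time download above produces the file that helper.py's _load_package_csv reads later. A quick sketch for checking that the data landed where the package expects it (not part of the package; the column names in the comment are only the ones helper.py references, the full header may differ):

import csv
from pathlib import Path

csv_path = Path.home() / ".datamule" / "listed_filer_metadata.csv"
print(csv_path.exists())          # True once setup.py has run

with csv_path.open(newline='') as f:
    header = next(csv.reader(f))
print(header)                     # expect columns such as 'cik', 'tickers', 'exchanges'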
datamule-1.0.3/datamule/book/book.py
REMOVED
@@ -1,34 +0,0 @@
-# Streams data rather than downloading it.
-# additional functionality such as query by xbrl, and other db
-# also this is basically our experimental rework of portfolio w/o disturbing existing users
-# this is highly experimental and may not work as expected
-# only for datamule source
-# likely new bottleneck will be local parsing() - will be bypassed in future when we have parsed archive
-# wow parsed archive is going to be crazy fast - like every 10k in 1 minute.
-
-# example queries filter by sic = 7372, xbrl query = dei:operatingprofit > 0 in date range 2018-2019
-
-# hmm do we go for sql esq or not.
-# I think we do.
-# i think we remove cik, ticker, sic, etc and just have a query object
-# should be sql esq so users can use it easily w/o learnign new syntax
-
-# WHERE submission_type = '10-K'
-# AND us-gaap:ResearchAndDevelopmentExpense > 0
-# AND dei:debt_to_equity < 2
-# AND filing_date BETWEEN '2023-01-01' AND '2023-12-31'
-# AND CIK in (123, 456, 789)
-# AND SIC in (123, 456, 789)
-# AND ticker in ('AAPL', 'GOOGL', 'AMZN')
-# AND document_type = 'EX-99.1' # to select attachments
-
-from .eftsquery import EFTSQuery
-
-
-class Book():
-    def process_submissions(self,cik,ticker,sic,submission_type,document_type,date,
-                            xbrl_query={},
-                            metadata_callback=None,
-                            document_callback=None,):
-        # grabs data and processes it
-        pass
datamule-1.0.3/datamule/book/eftsquery.py
REMOVED
@@ -1,127 +0,0 @@
-import asyncio
-import aiohttp
-from tqdm import tqdm
-from datetime import datetime
-from urllib.parse import urlencode
-import time
-
-class PreciseRateLimiter:
-    def __init__(self, rate=10, interval=1.0):
-        self.rate = rate # requests per interval
-        self.interval = interval # in seconds
-        self.token_time = self.interval / self.rate # time per token
-        self.last_time = time.time()
-        self.lock = asyncio.Lock()
-
-    async def acquire(self):
-        async with self.lock:
-            now = time.time()
-            wait_time = self.last_time + self.token_time - now
-            if wait_time > 0:
-                await asyncio.sleep(wait_time)
-            self.last_time = time.time()
-            return True
-
-class EFTSQuery:
-    def __init__(self):
-        self.headers = {
-            'User-Agent': 'Your Name yourname@email.com',
-            'Accept-Encoding': 'gzip, deflate',
-            'Host': 'efts.sec.gov'
-        }
-        self.session = None
-        self.limiter = PreciseRateLimiter(10)
-
-    async def __aenter__(self):
-        if not self.session:
-            self.session = aiohttp.ClientSession(headers=self.headers)
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        if self.session:
-            await self.session.close()
-            self.session = None
-
-    async def _fetch_json(self, url):
-        await self.limiter.acquire()
-        try:
-            async with self.session.get(url) as response:
-                if response.status == 429:
-                    await asyncio.sleep(61)
-                    return await self._fetch_json(url)
-                return await response.json()
-        except Exception as e:
-            print(f"Error fetching {url}: {str(e)}")
-            return None
-
-    async def _get_accession_numbers(self, base_url):
-        data = await self._fetch_json(f"{base_url}&from=0&size=1")
-        if not data or 'hits' not in data:
-            return []
-
-        total_hits = data['hits']['total']['value']
-        if not total_hits:
-            return []
-
-        accession_numbers = []
-        start = 0
-        page_size = 100
-        batch_size = 10 # Number of concurrent requests
-
-        with tqdm(total=total_hits) as pbar:
-            while start < total_hits:
-                tasks = []
-                for i in range(batch_size):
-                    if start + i * page_size >= total_hits:
-                        break
-                    url = f"{base_url}&from={start + i * page_size}&size={page_size}"
-                    tasks.append(self._fetch_json(url))
-
-                if not tasks:
-                    break
-
-                results = await asyncio.gather(*tasks)
-
-                for data in results:
-                    if data and 'hits' in data:
-                        hits = data['hits']['hits']
-                        batch_numbers = [
-                            f"{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}"
-                            for hit in hits
-                        ]
-                        accession_numbers.extend(batch_numbers)
-                        pbar.update(len(hits))
-
-                start += batch_size * page_size
-
-        return accession_numbers
-
-    def query_efts(self, cik=None, ticker=None, submission_type=None, filing_date=None, search_text=None):
-        async def _download():
-            async with self as downloader:
-                params = {}
-
-                if cik:
-                    params['ciks'] = str(cik).zfill(10)
-
-                if submission_type:
-                    params['forms'] = ','.join(submission_type) if isinstance(submission_type, list) else submission_type
-
-                if isinstance(filing_date, list):
-                    dates = [(d, d) for d in filing_date]
-                elif isinstance(filing_date, tuple):
-                    dates = [filing_date]
-                else:
-                    date_str = filing_date if filing_date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
-                    start, end = date_str.split(',')
-                    dates = [(start, end)]
-
-                params['startdt'], params['enddt'] = dates[0]
-
-                if search_text:
-                    params['q'] = f'"{search_text}"'
-
-                base_url = f"https://efts.sec.gov/LATEST/search-index?{urlencode(params, doseq=True)}"
-                return await self._get_accession_numbers(base_url)
-
-        return asyncio.run(_download())