datamule 1.0.3__tar.gz → 1.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {datamule-1.0.3 → datamule-1.0.7}/PKG-INFO +1 -4
  2. {datamule-1.0.3 → datamule-1.0.7}/datamule/__init__.py +2 -13
  3. {datamule-1.0.3 → datamule-1.0.7}/datamule/document.py +8 -9
  4. datamule-1.0.7/datamule/helper.py +103 -0
  5. datamule-1.0.7/datamule/portfolio.py +182 -0
  6. datamule-1.0.7/datamule/submission.py +38 -0
  7. {datamule-1.0.3 → datamule-1.0.7}/datamule.egg-info/PKG-INFO +1 -4
  8. datamule-1.0.7/datamule.egg-info/SOURCES.txt +12 -0
  9. {datamule-1.0.3 → datamule-1.0.7}/datamule.egg-info/requires.txt +1 -10
  10. datamule-1.0.7/setup.py +52 -0
  11. datamule-1.0.3/datamule/book/__init__.py +0 -0
  12. datamule-1.0.3/datamule/book/book.py +0 -34
  13. datamule-1.0.3/datamule/book/eftsquery.py +0 -127
  14. datamule-1.0.3/datamule/book/xbrl_retriever.py +0 -88
  15. datamule-1.0.3/datamule/data/company_former_names.csv +0 -8148
  16. datamule-1.0.3/datamule/data/company_metadata.csv +0 -10049
  17. datamule-1.0.3/datamule/data/company_tickers.csv +0 -9999
  18. datamule-1.0.3/datamule/data/sec-glossary.csv +0 -728
  19. datamule-1.0.3/datamule/data/xbrl_descriptions.csv +0 -10024
  20. datamule-1.0.3/datamule/downloader/downloader.py +0 -374
  21. datamule-1.0.3/datamule/downloader/premiumdownloader.py +0 -335
  22. datamule-1.0.3/datamule/helper.py +0 -123
  23. datamule-1.0.3/datamule/mapping_dicts/txt_mapping_dicts.py +0 -234
  24. datamule-1.0.3/datamule/mapping_dicts/xml_mapping_dicts.py +0 -19
  25. datamule-1.0.3/datamule/monitor.py +0 -283
  26. datamule-1.0.3/datamule/mulebot/__init__.py +0 -1
  27. datamule-1.0.3/datamule/mulebot/helper.py +0 -35
  28. datamule-1.0.3/datamule/mulebot/mulebot.py +0 -130
  29. datamule-1.0.3/datamule/mulebot/mulebot_server/__init__.py +0 -1
  30. datamule-1.0.3/datamule/mulebot/mulebot_server/server.py +0 -87
  31. datamule-1.0.3/datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -174
  32. datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -68
  33. datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -92
  34. datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -56
  35. datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -15
  36. datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/main.js +0 -57
  37. datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -27
  38. datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -47
  39. datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -129
  40. datamule-1.0.3/datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -28
  41. datamule-1.0.3/datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -91
  42. datamule-1.0.3/datamule/mulebot/search.py +0 -52
  43. datamule-1.0.3/datamule/mulebot/tools.py +0 -82
  44. datamule-1.0.3/datamule/packageupdater.py +0 -207
  45. datamule-1.0.3/datamule/portfolio.py +0 -106
  46. datamule-1.0.3/datamule/submission.py +0 -76
  47. datamule-1.0.3/datamule.egg-info/SOURCES.txt +0 -45
  48. datamule-1.0.3/setup.py +0 -47
  49. {datamule-1.0.3 → datamule-1.0.7}/datamule/config.py +0 -0
  50. {datamule-1.0.3 → datamule-1.0.7}/datamule.egg-info/dependency_links.txt +0 -0
  51. {datamule-1.0.3 → datamule-1.0.7}/datamule.egg-info/top_level.txt +0 -0
  52. {datamule-1.0.3 → datamule-1.0.7}/setup.cfg +0 -0
@@ -1,9 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.0.3
+ Version: 1.0.7
  Summary: Making it easier to use SEC filings.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
- Provides-Extra: mulebot
- Provides-Extra: mulebot_server
- Provides-Extra: all
@@ -1,12 +1,7 @@
- from .downloader.downloader import Downloader
- from .downloader.premiumdownloader import PremiumDownloader
- from .monitor import Monitor
- from .packageupdater import PackageUpdater
  from .submission import Submission
  from .portfolio import Portfolio
  from .document import Document
- from secsgml import parse_sgml_submission
- from .helper import load_package_csv, load_package_dataset
+ from .helper import _load_package_csv, load_package_dataset
  from .config import Config


@@ -32,16 +27,10 @@ def _setup_notebook_env():
  _setup_notebook_env()

  __all__ = [
-     'Downloader',
-     'PremiumDownloader',
-     'load_package_csv',
+     '_load_package_csv',
      'load_package_dataset',
-     'Filing',
      'Portfolio',
-     'Monitor',
-     'PackageUpdater',
      'Submission',
      'Document',
-     'parse_sgml_submission',
      'Config'
  ]
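
For reference, a minimal sketch of the public import surface implied by the trimmed __all__ above (sketch only, assuming datamule 1.0.7 is installed; the Downloader, PremiumDownloader, Monitor, PackageUpdater, and parse_sgml_submission entry points removed here are no longer exported):

    # what the 1.0.7 package re-exports at the top level, per the __all__ above
    from datamule import Portfolio, Submission, Document, Config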
@@ -1,11 +1,10 @@
  import json
  import csv
- from .helper import convert_to_dashed_accession
  import re
  from doc2dict import xml2dict, txt2dict, dict2dict
  from doc2dict.mapping import flatten_hierarchy
- from .mapping_dicts import txt_mapping_dicts
- from .mapping_dicts import xml_mapping_dicts
+ from .mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
+ from .mapping_dicts.xml_mapping_dicts import dict_345
  from selectolax.parser import HTMLParser

  class Document:
@@ -107,7 +106,7 @@ class Document:

          if self.path.suffix == '.xml':
              if self.type in ['3', '4', '5']:
-                 mapping_dict = xml_mapping_dicts.dict_345
+                 mapping_dict = dict_345

              self.load_content()
              self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
@@ -116,15 +115,15 @@ class Document:
          self._load_file_content()

          if self.type == '10-K':
-             mapping_dict = txt_mapping_dicts.dict_10k
+             mapping_dict = dict_10k
          elif self.type == '10-Q':
-             mapping_dict = txt_mapping_dicts.dict_10q
+             mapping_dict = dict_10q
          elif self.type == '8-K':
-             mapping_dict = txt_mapping_dicts.dict_8k
+             mapping_dict = dict_8k
          elif self.type == 'SC 13D':
-             mapping_dict = txt_mapping_dicts.dict_13d
+             mapping_dict = dict_13d
          elif self.type == 'SC 13G':
-             mapping_dict = txt_mapping_dicts.dict_13g
+             mapping_dict = dict_13g

          self.data = {}
          self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
@@ -0,0 +1,103 @@
+ from functools import lru_cache
+ import csv
+ from pathlib import Path
+
+ def _load_package_csv(name):
+     """Load CSV files from ~/.datamule/ directory"""
+     data_dir = Path.home() / ".datamule"
+     csv_path = data_dir / f"{name}.csv"
+
+     data = []
+
+     with open(csv_path, 'r') as csvfile:
+         csv_reader = csv.DictReader(csvfile)
+         for row in csv_reader:
+             data.append(row)
+
+     return data
+
+ def load_package_dataset(dataset):
+     if dataset == 'listed_filer_metadata':
+         return _load_package_csv('listed_filer_metadata')
+
+ @lru_cache(maxsize=128)
+ def get_cik_from_dataset(dataset_name, key, value):
+     dataset = load_package_dataset(dataset_name)
+
+     if dataset_name == 'listed_filer_metadata' and key == 'ticker':
+         key = 'tickers'
+
+     result = []
+     for company in dataset:
+         if key in ['tickers', 'exchanges'] and dataset_name == 'listed_filer_metadata':
+             # Parse the string representation of list into an actual list
+             list_values = [i.strip() for i in company[key][1:-1].replace("'", "").replace('"', '').split(',')]
+             if str(value) in list_values:
+                 result.append(company['cik'])
+         elif str(value) == company[key]:
+             result.append(company['cik'])
+
+     return result
+
+
+ @lru_cache(maxsize=128)
+ def get_ciks_from_metadata_filters(**kwargs):
+     """Get CIKs from listed_filer_metadata.csv that match all provided filters."""
+
+     # Start with None to get all CIKs from first filter
+     result_ciks = None
+
+     # For each filter, get matching CIKs and keep intersection
+     for key, value in kwargs.items():
+         # Get CIKs for this filter
+         ciks = get_cik_from_dataset('listed_filer_metadata', key, value)
+         ciks = [int(cik) for cik in ciks]
+
+         # If this is the first filter, set as initial result
+         if result_ciks is None:
+             result_ciks = set(ciks)
+         # Otherwise, take intersection with previous results
+         else:
+             result_ciks &= set(ciks)
+
+         # If no matches left, we can exit early
+         if not result_ciks:
+             return []
+
+     return list(result_ciks)
+
+
+ def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
+     """
+     Helper method to process CIK, ticker, and metadata filters.
+     Returns a list of CIKs after processing.
+     """
+     # Input validation
+     if cik is not None and ticker is not None:
+         raise ValueError("Only one of cik or ticker should be provided, not both.")
+
+     # Convert ticker to CIK if provided
+     if ticker is not None:
+         cik = get_cik_from_dataset('listed_filer_metadata', 'ticker', ticker)
+
+     # Normalize CIK format
+     if cik is not None:
+         if isinstance(cik, str):
+             cik = [int(cik)]
+         elif isinstance(cik, int):
+             cik = [cik]
+         elif isinstance(cik, list):
+             cik = [int(x) for x in cik]
+
+     # Process metadata filters if provided
+     if kwargs:
+         metadata_ciks = get_ciks_from_metadata_filters(**kwargs)
+
+         if cik is not None:
+             cik = list(set(cik).intersection(metadata_ciks))
+         else:
+             cik = metadata_ciks
+
+     return cik
+
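
These helpers resolve a ticker or arbitrary metadata-column filters into a list of CIKs read from ~/.datamule/listed_filer_metadata.csv (the file the new setup.py downloads, see below). A minimal usage sketch; the column name 'sic' and the ticker value are illustrative assumptions, not confirmed fields of that CSV:

    from datamule.helper import _process_cik_and_metadata_filters, get_ciks_from_metadata_filters

    # ticker -> list of integer CIKs (assumes 'AAPL' appears in the metadata CSV)
    ciks = _process_cik_and_metadata_filters(ticker='AAPL')

    # metadata filters are intersected column by column ('sic' is a hypothetical column name)
    software_ciks = get_ciks_from_metadata_filters(sic='7372')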
@@ -0,0 +1,182 @@
+ from pathlib import Path
+ from tqdm import tqdm
+ from concurrent.futures import ThreadPoolExecutor
+ from .submission import Submission
+ from .sec.submissions.downloader import download as sec_download
+ from .sec.submissions.textsearch import filter_text
+ from .config import Config
+ import os
+ from .helper import _process_cik_and_metadata_filters
+ from .seclibrary.downloader import download as seclibrary_download
+ from .sec.xbrl.filter_xbrl import filter_xbrl
+ from .sec.submissions.monitor import monitor
+ from .sec.xbrl.xbrlmonitor import XBRLMonitor
+
+
+ class Portfolio:
+     def __init__(self, path):
+         self.path = Path(path)
+         self.submissions = []
+         self.submissions_loaded = False
+         self.MAX_WORKERS = os.cpu_count() - 1
+
+         if self.path.exists():
+             self._load_submissions()
+             self.submissions_loaded = True
+         else:
+             self.path.mkdir(parents=True, exist_ok=True)
+
+     def _load_submissions(self):
+         folders = [f for f in self.path.iterdir() if f.is_dir()]
+         print(f"Loading {len(folders)} submissions")
+
+         def load_submission(folder):
+             try:
+                 return Submission(folder)
+             except Exception as e:
+                 print(f"Error loading submission from {folder}: {str(e)}")
+                 return None
+
+         with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+             self.submissions = list(tqdm(
+                 executor.map(load_submission, folders),
+                 total=len(folders),
+                 desc="Loading submissions"
+             ))
+
+         # Filter out None values from failed submissions
+         self.submissions = [s for s in self.submissions if s is not None]
+         print(f"Successfully loaded {len(self.submissions)} submissions")
+
+     def process_submissions(self, callback):
+         """Process all submissions using a thread pool."""
+         if not self.submissions_loaded:
+             self._load_submissions()
+         with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+             results = list(tqdm(
+                 executor.map(callback, self.submissions),
+                 total=len(self.submissions),
+                 desc="Processing submissions"
+             ))
+         return results
+
+     def process_documents(self, callback):
+         """Process all documents using a thread pool."""
+         if not self.submissions_loaded:
+             self._load_submissions()
+
+         documents = [doc for sub in self.submissions for doc in sub]
+
+         with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+             results = list(tqdm(
+                 executor.map(callback, documents),
+                 total=len(documents),
+                 desc="Processing documents"
+             ))
+         return results
+
+     def filter_text(self, text_query, cik=None, ticker=None, submission_type=None, filing_date=None, **kwargs):
+         """
+         Filter text based on query and various parameters.
+         When called multiple times, takes the intersection of results.
+         Now supports metadata filters through kwargs.
+         """
+         # Process CIK and metadata filters
+         cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+         # Call the filter_text function with processed parameters
+         new_accession_numbers = filter_text(
+             text_query=text_query,
+             cik=cik,
+             submission_type=submission_type,
+             filing_date=filing_date
+         )
+
+         # If we already have accession numbers, take the intersection
+         if hasattr(self, 'accession_numbers') and self.accession_numbers:
+             self.accession_numbers = list(set(self.accession_numbers).intersection(new_accession_numbers))
+         else:
+             # First query, just set the accession numbers
+             self.accession_numbers = new_accession_numbers
+
+     def filter_xbrl(self, taxonomy, concept, unit, period, logic, value):
+         """
+         Filter XBRL data based on logic and value.
+         """
+         new_accession_numbers = filter_xbrl(
+             taxonomy=taxonomy,
+             concept=concept,
+             unit=unit,
+             period=period,
+             logic=logic,
+             value=value
+         )
+
+         # If we already have accession numbers, take the intersection
+         if hasattr(self, 'accession_numbers') and self.accession_numbers:
+             self.accession_numbers = list(set(self.accession_numbers).intersection(new_accession_numbers))
+         else:
+             # First query, just set the accession numbers
+             self.accession_numbers = new_accession_numbers
+
+     def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, **kwargs):
+         if provider is None:
+             config = Config()
+             provider = config.get_default_source()
+
+         # Process CIK and metadata filters
+         cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+         if provider == 'datamule':
+             seclibrary_download(
+                 output_dir=self.path,
+                 cik=cik,
+                 submission_type=submission_type,
+                 filing_date=filing_date,
+                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+             )
+         else:
+             sec_download(
+                 output_dir=self.path,
+                 cik=cik,
+                 submission_type=submission_type,
+                 filing_date=filing_date,
+                 requests_per_second=5,  # Revisit this later.
+                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+             )
+
+         self.submissions_loaded = False
+
+     def monitor_submissions(self, data_callback=None, poll_callback=None, submission_type=None, cik=None,
+                             polling_interval=200, requests_per_second=5, quiet=False, start_date=None, ticker=None, **kwargs):
+
+         cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+         monitor(
+             data_callback=data_callback,
+             poll_callback=poll_callback,
+             cik=cik,
+             submission_type=submission_type,
+             polling_interval=polling_interval,
+             requests_per_second=requests_per_second,
+             quiet=quiet,
+             start_date=start_date
+         )
+
+     def __iter__(self):
+         if not self.submissions_loaded:
+             self._load_submissions()
+         return iter(self.submissions)
+
+     def document_type(self, document_types):
+         """Filter documents by type(s)."""
+         if not self.submissions_loaded:
+             self._load_submissions()
+         if isinstance(document_types, str):
+             document_types = [document_types]
+
+         for submission in self.submissions:
+             yield from submission.document_type(document_types)
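
Portfolio ties the pieces together: filter_text and filter_xbrl accumulate accession_numbers by intersection, download_submissions hands them to the selected provider, and iteration lazily loads each submission folder. A hedged usage sketch based only on the signatures above (values are illustrative):

    from datamule import Portfolio

    portfolio = Portfolio('apple_filings')            # folder is created if it does not exist
    portfolio.filter_text('climate risk', ticker='AAPL', submission_type='10-K')
    portfolio.download_submissions(ticker='AAPL', submission_type='10-K', provider='datamule')
    for doc in portfolio.document_type('10-K'):       # yields Document objects from each submission
        print(doc.path)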
@@ -0,0 +1,38 @@
+ from pathlib import Path
+ import json
+ from .document import Document
+
+ class Submission:
+     def __init__(self, path):
+         self.path = Path(path)
+         self._load_metadata()
+
+     def _load_metadata(self):
+         metadata_path = self.path / 'metadata.json'
+         with metadata_path.open('r') as f:
+             self.metadata = json.load(f)
+
+     def document_type(self, document_type):
+         # Convert single document type to list for consistent handling
+         if isinstance(document_type, str):
+             document_types = [document_type]
+         else:
+             document_types = document_type
+
+         for doc in self.metadata['documents']:
+             if doc['type'] in document_types:
+                 filename = doc.get('filename')
+                 if filename is None:
+                     continue
+
+                 document_path = self.path / filename
+                 yield Document(doc['type'], document_path)
+
+     def __iter__(self):
+         for doc in self.metadata['documents']:
+             filename = doc.get('filename')
+             if filename is None:
+                 continue
+
+             document_path = self.path / filename
+             yield Document(doc['type'], document_path)
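
A Submission wraps one downloaded folder and its metadata.json manifest; iterating yields a Document for every listed file. A small sketch, assuming such a folder already exists on disk (the folder name below is hypothetical):

    from datamule import Submission

    sub = Submission('apple_filings/000032019323000106')
    for document in sub:                            # every document named in metadata.json
        print(document.type, document.path)
    exhibits = list(sub.document_type('EX-99.1'))   # restrict to one document type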
@@ -1,9 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.0.3
+ Version: 1.0.7
  Summary: Making it easier to use SEC filings.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
- Provides-Extra: mulebot
- Provides-Extra: mulebot_server
- Provides-Extra: all
@@ -0,0 +1,12 @@
+ setup.py
+ datamule/__init__.py
+ datamule/config.py
+ datamule/document.py
+ datamule/helper.py
+ datamule/portfolio.py
+ datamule/submission.py
+ datamule.egg-info/PKG-INFO
+ datamule.egg-info/SOURCES.txt
+ datamule.egg-info/dependency_links.txt
+ datamule.egg-info/requires.txt
+ datamule.egg-info/top_level.txt
@@ -11,13 +11,4 @@ pytz
  zstandard
  doc2dict
  secsgml
-
- [all]
- openai
- flask
-
- [mulebot]
- openai
-
- [mulebot_server]
- flask
+ lxml
@@ -0,0 +1,52 @@
+ from setuptools import setup
+ from setuptools import find_namespace_packages
+ import os
+ import gzip
+ import shutil
+ import urllib.request
+ from pathlib import Path
+
+ # Create data directory in user's home
+ data_dir = Path.home() / ".datamule"
+ data_dir.mkdir(exist_ok=True)
+
+ # Download data file
+ file_url = "https://github.com/john-friedman/datamule-data/raw/master/data/filer_metadata/listed_filer_metadata.csv.gz"
+ file_path = data_dir / "listed_filer_metadata.csv"
+ temp_gz_path = data_dir / "listed_filer_metadata.csv.gz"
+
+ if not file_path.exists():
+     print(f"Downloading data to {data_dir}")
+     urllib.request.urlretrieve(file_url, temp_gz_path)
+
+     with gzip.open(temp_gz_path, 'rb') as f_in:
+         with open(file_path, 'wb') as f_out:
+             shutil.copyfileobj(f_in, f_out)
+
+     os.remove(temp_gz_path)
+     print(f"Data downloaded to {file_path}")
+
+ setup(
+     name="datamule",
+     author="John Friedman",
+     version="1.0.7",
+     description="Making it easier to use SEC filings.",
+     packages=find_namespace_packages(include=['datamule']),
+     url="https://github.com/john-friedman/datamule-python",
+     install_requires=[
+         'aiohttp',
+         'aiolimiter',
+         'tqdm',
+         'requests',
+         'nest_asyncio',
+         'aiofiles',
+         'polars',
+         'setuptools',
+         'selectolax',
+         'pytz',
+         'zstandard',
+         'doc2dict',
+         'secsgml',
+         'lxml'
+     ]
+ )
File without changes
@@ -1,34 +0,0 @@
- # Streams data rather than downloading it.
- # additional functionality such as query by xbrl, and other db
- # also this is basically our experimental rework of portfolio w/o disturbing existing users
- # this is highly experimental and may not work as expected
- # only for datamule source
- # likely new bottleneck will be local parsing() - will be bypassed in future when we have parsed archive
- # wow parsed archive is going to be crazy fast - like every 10k in 1 minute.
-
- # example queries filter by sic = 7372, xbrl query = dei:operatingprofit > 0 in date range 2018-2019
-
- # hmm do we go for sql esq or not.
- # I think we do.
- # i think we remove cik, ticker, sic, etc and just have a query object
- # should be sql esq so users can use it easily w/o learnign new syntax
-
- # WHERE submission_type = '10-K'
- # AND us-gaap:ResearchAndDevelopmentExpense > 0
- # AND dei:debt_to_equity < 2
- # AND filing_date BETWEEN '2023-01-01' AND '2023-12-31'
- # AND CIK in (123, 456, 789)
- # AND SIC in (123, 456, 789)
- # AND ticker in ('AAPL', 'GOOGL', 'AMZN')
- # AND document_type = 'EX-99.1' # to select attachments
-
- from .eftsquery import EFTSQuery
-
-
- class Book():
-     def process_submissions(self, cik, ticker, sic, submission_type, document_type, date,
-                             xbrl_query={},
-                             metadata_callback=None,
-                             document_callback=None,):
-         # grabs data and processes it
-         pass
@@ -1,127 +0,0 @@
- import asyncio
- import aiohttp
- from tqdm import tqdm
- from datetime import datetime
- from urllib.parse import urlencode
- import time
-
- class PreciseRateLimiter:
-     def __init__(self, rate=10, interval=1.0):
-         self.rate = rate  # requests per interval
-         self.interval = interval  # in seconds
-         self.token_time = self.interval / self.rate  # time per token
-         self.last_time = time.time()
-         self.lock = asyncio.Lock()
-
-     async def acquire(self):
-         async with self.lock:
-             now = time.time()
-             wait_time = self.last_time + self.token_time - now
-             if wait_time > 0:
-                 await asyncio.sleep(wait_time)
-             self.last_time = time.time()
-             return True
-
- class EFTSQuery:
-     def __init__(self):
-         self.headers = {
-             'User-Agent': 'Your Name yourname@email.com',
-             'Accept-Encoding': 'gzip, deflate',
-             'Host': 'efts.sec.gov'
-         }
-         self.session = None
-         self.limiter = PreciseRateLimiter(10)
-
-     async def __aenter__(self):
-         if not self.session:
-             self.session = aiohttp.ClientSession(headers=self.headers)
-         return self
-
-     async def __aexit__(self, exc_type, exc_val, exc_tb):
-         if self.session:
-             await self.session.close()
-             self.session = None
-
-     async def _fetch_json(self, url):
-         await self.limiter.acquire()
-         try:
-             async with self.session.get(url) as response:
-                 if response.status == 429:
-                     await asyncio.sleep(61)
-                     return await self._fetch_json(url)
-                 return await response.json()
-         except Exception as e:
-             print(f"Error fetching {url}: {str(e)}")
-             return None
-
-     async def _get_accession_numbers(self, base_url):
-         data = await self._fetch_json(f"{base_url}&from=0&size=1")
-         if not data or 'hits' not in data:
-             return []
-
-         total_hits = data['hits']['total']['value']
-         if not total_hits:
-             return []
-
-         accession_numbers = []
-         start = 0
-         page_size = 100
-         batch_size = 10  # Number of concurrent requests
-
-         with tqdm(total=total_hits) as pbar:
-             while start < total_hits:
-                 tasks = []
-                 for i in range(batch_size):
-                     if start + i * page_size >= total_hits:
-                         break
-                     url = f"{base_url}&from={start + i * page_size}&size={page_size}"
-                     tasks.append(self._fetch_json(url))
-
-                 if not tasks:
-                     break
-
-                 results = await asyncio.gather(*tasks)
-
-                 for data in results:
-                     if data and 'hits' in data:
-                         hits = data['hits']['hits']
-                         batch_numbers = [
-                             f"{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}"
-                             for hit in hits
-                         ]
-                         accession_numbers.extend(batch_numbers)
-                         pbar.update(len(hits))
-
-                 start += batch_size * page_size
-
-         return accession_numbers
-
-     def query_efts(self, cik=None, ticker=None, submission_type=None, filing_date=None, search_text=None):
-         async def _download():
-             async with self as downloader:
-                 params = {}
-
-                 if cik:
-                     params['ciks'] = str(cik).zfill(10)
-
-                 if submission_type:
-                     params['forms'] = ','.join(submission_type) if isinstance(submission_type, list) else submission_type
-
-                 if isinstance(filing_date, list):
-                     dates = [(d, d) for d in filing_date]
-                 elif isinstance(filing_date, tuple):
-                     dates = [filing_date]
-                 else:
-                     date_str = filing_date if filing_date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
-                     start, end = date_str.split(',')
-                     dates = [(start, end)]
-
-                 params['startdt'], params['enddt'] = dates[0]
-
-                 if search_text:
-                     params['q'] = f'"{search_text}"'
-
-                 base_url = f"https://efts.sec.gov/LATEST/search-index?{urlencode(params, doseq=True)}"
-                 return await self._get_accession_numbers(base_url)
-
-         return asyncio.run(_download())