datamule 1.0.2__py3-none-any.whl → 1.0.6__py3-none-any.whl

Files changed (43)
  1. datamule/__init__.py +2 -13
  2. datamule/document.py +0 -1
  3. datamule/helper.py +85 -105
  4. datamule/portfolio.py +105 -29
  5. datamule/submission.py +0 -38
  6. {datamule-1.0.2.dist-info → datamule-1.0.6.dist-info}/METADATA +2 -8
  7. datamule-1.0.6.dist-info/RECORD +10 -0
  8. datamule/book/__init__.py +0 -0
  9. datamule/book/book.py +0 -34
  10. datamule/book/eftsquery.py +0 -127
  11. datamule/book/xbrl_retriever.py +0 -88
  12. datamule/data/company_former_names.csv +0 -8148
  13. datamule/data/company_metadata.csv +0 -10049
  14. datamule/data/company_tickers.csv +0 -9999
  15. datamule/data/sec-glossary.csv +0 -728
  16. datamule/data/xbrl_descriptions.csv +0 -10024
  17. datamule/downloader/downloader.py +0 -374
  18. datamule/downloader/premiumdownloader.py +0 -335
  19. datamule/mapping_dicts/txt_mapping_dicts.py +0 -232
  20. datamule/mapping_dicts/xml_mapping_dicts.py +0 -19
  21. datamule/monitor.py +0 -238
  22. datamule/mulebot/__init__.py +0 -1
  23. datamule/mulebot/helper.py +0 -35
  24. datamule/mulebot/mulebot.py +0 -130
  25. datamule/mulebot/mulebot_server/__init__.py +0 -1
  26. datamule/mulebot/mulebot_server/server.py +0 -87
  27. datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -174
  28. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -68
  29. datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -92
  30. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -56
  31. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -15
  32. datamule/mulebot/mulebot_server/static/scripts/main.js +0 -57
  33. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -27
  34. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -47
  35. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -129
  36. datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -28
  37. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -91
  38. datamule/mulebot/search.py +0 -52
  39. datamule/mulebot/tools.py +0 -82
  40. datamule/packageupdater.py +0 -207
  41. datamule-1.0.2.dist-info/RECORD +0 -43
  42. {datamule-1.0.2.dist-info → datamule-1.0.6.dist-info}/WHEEL +0 -0
  43. {datamule-1.0.2.dist-info → datamule-1.0.6.dist-info}/top_level.txt +0 -0
datamule/__init__.py CHANGED
@@ -1,12 +1,7 @@
- from .downloader.downloader import Downloader
- from .downloader.premiumdownloader import PremiumDownloader
- from .monitor import Monitor
- from .packageupdater import PackageUpdater
  from .submission import Submission
  from .portfolio import Portfolio
  from .document import Document
- from secsgml import parse_sgml_submission
- from .helper import load_package_csv, load_package_dataset
+ from .helper import _load_package_csv, load_package_dataset
  from .config import Config


@@ -32,16 +27,10 @@ def _setup_notebook_env():
  _setup_notebook_env()

  __all__ = [
-     'Downloader',
-     'PremiumDownloader',
-     'load_package_csv',
+     '_load_package_csv',
      'load_package_dataset',
-     'Filing',
      'Portfolio',
-     'Monitor',
-     'PackageUpdater',
      'Submission',
      'Document',
-     'parse_sgml_submission',
      'Config'
  ]
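
The 1.0.6 wheel trims the public surface down to the six names left in __all__ above. A minimal sketch of what still imports cleanly, assuming the 1.0.6 wheel is installed:

    # Still exported in 1.0.6
    from datamule import Portfolio, Submission, Document, Config

    # Removed in 1.0.6 -- these 1.0.2-era imports should now fail with ImportError:
    # from datamule import Downloader, PremiumDownloader, Monitor, PackageUpdater, parse_sgml_submission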
datamule/document.py CHANGED
@@ -1,6 +1,5 @@
  import json
  import csv
- from .helper import convert_to_dashed_accession
  import re
  from doc2dict import xml2dict, txt2dict, dict2dict
  from doc2dict.mapping import flatten_hierarchy
datamule/helper.py CHANGED
@@ -1,123 +1,103 @@
- import requests
- import os
- from tqdm import tqdm
- import zipfile
- from pkg_resources import resource_filename
+ from functools import lru_cache
  import csv
- import re
+ from pathlib import Path

- # Unused in current implementation.
- def construct_primary_doc_url(cik, accession_number,primary_doc_url):
-     accession_number = accession_number.replace("-", "")
-     return f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{primary_doc_url}"
-
- # DONE
- def _download_from_dropbox(url, output_path):
-     headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
-     r = requests.get(url, stream=True, headers=headers)
-     total_size = int(r.headers.get('content-length', 0))
+ def _load_package_csv(name):
+     """Load CSV files from ~/.datamule/ directory"""
+     data_dir = Path.home() / ".datamule"
+     csv_path = data_dir / f"{name}.csv"

-     with open(output_path, 'wb') as f, tqdm(
-         desc="Downloading " + os.path.basename(output_path),
-         total=total_size,
-         unit='iB',
-         unit_scale=True,
-         unit_divisor=1024,
-     ) as progress_bar:
-         for chunk in r.iter_content(chunk_size=1024):
-             size = f.write(chunk)
-             progress_bar.update(size)
-
-     # Check if the downloaded file is a zip file
-     if zipfile.is_zipfile(output_path):
-         extract_path = os.path.dirname(output_path)
-         with zipfile.ZipFile(output_path, 'r') as zip_ref:
-             for file_info in zip_ref.infolist():
-                 extract_file_path = os.path.join(extract_path, file_info.filename)
-                 with zip_ref.open(file_info) as file_in_zip, \
-                      open(extract_file_path, 'wb') as output_file, \
-                      tqdm(total=file_info.file_size, unit='B', unit_scale=True,
-                           desc=f"Extracting {file_info.filename}") as pbar:
-                     while True:
-                         chunk = file_in_zip.read(8192)
-                         if not chunk:
-                             break
-                         output_file.write(chunk)
-                         pbar.update(len(chunk))
-
-         # Remove the zip file after extraction
-         os.remove(output_path)
-         print(f"Extracted contents to {extract_path}")
-     else:
-         print(f"Downloaded file is not a zip. Saved to {output_path}")
-
- # May generalize to load any package resource
- def load_package_csv(name):
-     """Load package CSV files"""
-     csv_path = resource_filename('datamule', f'data/{name}.csv')
-     company_tickers = []
+     data = []

      with open(csv_path, 'r') as csvfile:
          csv_reader = csv.DictReader(csvfile)
          for row in csv_reader:
-             company_tickers.append(row)
+             data.append(row)

-     return company_tickers
+     return data

  def load_package_dataset(dataset):
-     if dataset == 'company_tickers':
-         return load_package_csv('company_tickers')
-     elif dataset =='company_former_names':
-         return load_package_csv('company_former_names')
-     elif dataset =='company_metadata':
-         return load_package_csv('company_metadata')
-     elif dataset == 'sec_glossary':
-         return load_package_csv('sec-glossary')
-     elif dataset == 'xbrl_descriptions':
-         return load_package_csv('xbrl_descriptions')
+     if dataset =='listed_filer_metadata':
+         return _load_package_csv('listed_filer_metadata')

- # DONE
- def identifier_to_cik(ticker):
-     """Convert company tickers to CIK codes"""
-     company_tickers = load_package_csv('company_tickers')
-     if ticker:
-         if isinstance(ticker, list):
-             cik = []
-             for t in ticker:
-                 cik.extend([company['cik'] for company in company_tickers if t == company['ticker']])
-         else:
-             cik = [company['cik'] for company in company_tickers if ticker == company['ticker']]
-
-     if not cik:
-         raise ValueError("No matching companies found")
-
-     return cik
+ @lru_cache(maxsize=128)
+ def get_cik_from_dataset(dataset_name, key, value):
+     dataset = load_package_dataset(dataset_name)
+
+     if dataset_name == 'listed_filer_metadata' and key == 'ticker':
+         key = 'tickers'
+
+     result = []
+     for company in dataset:
+         if key in ['tickers', 'exchanges'] and dataset_name == 'listed_filer_metadata':
+             # Parse the string representation of list into an actual list
+             list_values = [i.strip() for i in company[key][1:-1].replace("'", "").replace('"', '').split(',')]
+             if str(value) in list_values:
+                 result.append(company['cik'])
+         elif str(value) == company[key]:
+             result.append(company['cik'])
+
+     return result


- def fix_filing_url(url):
-     match_suffix = re.search(r'/(\d{4})\.(.+?)$', url)
-     if match_suffix:
-         suffix_number = match_suffix.group(1)
-         file_ext = match_suffix.group(2)
-         match_accession = re.search(r'/(\d{18})/', url)
-         if match_accession:
-             accession_number = match_accession.group(1)
-             formatted_accession_number = f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"
-             new_url = url.rsplit('/', 1)[0] + f'/{formatted_accession_number}-{suffix_number}.{file_ext}'
-             return new_url
-     return url

- def convert_to_dashed_accession(accession):
-     # Remove any existing dashes or whitespace
-     cleaned = ''.join(accession.split())
+ @lru_cache(maxsize=128)
+ def get_ciks_from_metadata_filters(**kwargs):
+     """Get CIKs from listed_filer_metadata.csv that match all provided filters."""

-     # Check if the cleaned string has 18 characters
-     if len(cleaned) != 18:
-         raise ValueError("Invalid accession number format. Expected 18 characters.")
+     # Start with None to get all CIKs from first filter
+     result_ciks = None

-     # Insert dashes at the correct positions
-     dashed = f"{cleaned[:10]}-{cleaned[10:12]}-{cleaned[12:]}"
+     # For each filter, get matching CIKs and keep intersection
+     for key, value in kwargs.items():
+         # Get CIKs for this filter
+         ciks = get_cik_from_dataset('listed_filer_metadata', key, value)
+         ciks = [int(cik) for cik in ciks]
+
+         # If this is the first filter, set as initial result
+         if result_ciks is None:
+             result_ciks = set(ciks)
+         # Otherwise, take intersection with previous results
+         else:
+             result_ciks &= set(ciks)
+
+         # If no matches left, we can exit early
+         if not result_ciks:
+             return []

-     return dashed
+     return list(result_ciks)
+
+
+ def _process_cik_and_metadata_filters(cik=None, ticker=None, **kwargs):
+     """
+     Helper method to process CIK, ticker, and metadata filters.
+     Returns a list of CIKs after processing.
+     """
+     # Input validation
+     if cik is not None and ticker is not None:
+         raise ValueError("Only one of cik or ticker should be provided, not both.")
+
+     # Convert ticker to CIK if provided
+     if ticker is not None:
+         cik = get_cik_from_dataset('listed_filer_metadata', 'ticker', ticker)
+
+     # Normalize CIK format
+     if cik is not None:
+         if isinstance(cik, str):
+             cik = [int(cik)]
+         elif isinstance(cik, int):
+             cik = [cik]
+         elif isinstance(cik, list):
+             cik = [int(x) for x in cik]
+
+     # Process metadata filters if provided
+     if kwargs:
+         metadata_ciks = get_ciks_from_metadata_filters(**kwargs)

- headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
+         if cik is not None:
+             cik = list(set(cik).intersection(metadata_ciks))
+         else:
+             cik = metadata_ciks
+
+     return cik
+
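
Taken together, the new helpers replace the bundled data CSVs with a single lookup against ~/.datamule/listed_filer_metadata.csv. A hedged usage sketch of _process_cik_and_metadata_filters, based only on the signatures above; the ticker value and the 'sic' column used as a metadata filter are illustrative assumptions, not confirmed column names:

    from datamule.helper import _process_cik_and_metadata_filters

    # Ticker is resolved to CIKs via the 'tickers' column of listed_filer_metadata.csv
    ciks = _process_cik_and_metadata_filters(ticker="AAPL")   # illustrative ticker

    # Extra keyword arguments are treated as metadata filters and intersected;
    # 'sic' is an assumed column name here, purely for illustration.
    ciks = _process_cik_and_metadata_filters(sic="7372")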
datamule/portfolio.py CHANGED
@@ -2,19 +2,29 @@ from pathlib import Path
  from tqdm import tqdm
  from concurrent.futures import ThreadPoolExecutor
  from .submission import Submission
- from .downloader.premiumdownloader import PremiumDownloader
- from .downloader.downloader import Downloader
+ from .sec.submissions.downloader import download as sec_download
+ from .sec.submissions.textsearch import filter_text
  from .config import Config
  import os
+ from .helper import _process_cik_and_metadata_filters
+ from .seclibrary.downloader import download as seclibrary_download
+ from .sec.xbrl.filter_xbrl import filter_xbrl
+ from .sec.submissions.monitor import monitor
+ from .sec.xbrl.xbrlmonitor import XBRLMonitor
+

  class Portfolio:
      def __init__(self, path):
          self.path = Path(path)
          self.submissions = []
+         self.submissions_loaded = False
          self.MAX_WORKERS = os.cpu_count() - 1

          if self.path.exists():
              self._load_submissions()
+             self.submissions_loaded = True
+         else:
+             self.path.mkdir(parents=True, exist_ok=True)

      def _load_submissions(self):
          folders = [f for f in self.path.iterdir() if f.is_dir()]
@@ -40,6 +50,8 @@ class Portfolio:

      def process_submissions(self, callback):
          """Process all submissions using a thread pool."""
+         if not self.submissions_loaded:
+             self._load_submissions()
          with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
              results = list(tqdm(
                  executor.map(callback, self.submissions),
@@ -50,6 +62,9 @@ class Portfolio:

      def process_documents(self, callback):
          """Process all documents using a thread pool."""
+         if not self.submissions_loaded:
+             self._load_submissions()
+
          documents = [doc for sub in self.submissions for doc in sub]

          with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
@@ -59,48 +74,109 @@ class Portfolio:
                  desc="Processing documents"
              ))
          return results
+
+     def filter_text(self, text_query, cik=None, ticker=None, submission_type=None, filing_date=None, **kwargs):
+         """
+         Filter text based on query and various parameters.
+         When called multiple times, takes the intersection of results.
+         Now supports metadata filters through kwargs.
+         """
+         # Process CIK and metadata filters
+         cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+         # Call the filter_text function with processed parameters
+         new_accession_numbers = filter_text(
+             text_query=text_query,
+             cik=cik,
+             submission_type=submission_type,
+             filing_date=filing_date
+         )
+
+         # If we already have accession numbers, take the intersection
+         if hasattr(self, 'accession_numbers') and self.accession_numbers:
+             self.accession_numbers = list(set(self.accession_numbers).intersection(new_accession_numbers))
+         else:
+             # First query, just set the accession numbers
+             self.accession_numbers = new_accession_numbers
+
+     def filter_xbrl(self, taxonomy, concept, unit, period, logic, value):
+         """
+         Filter XBRL data based on logic and value.
+         """
+         new_accession_numbers = filter_xbrl(
+             taxonomy=taxonomy,
+             concept=concept,
+             unit=unit,
+             period=period,
+             logic=logic,
+             value=value
+         )
+
+         # If we already have accession numbers, take the intersection
+         if hasattr(self, 'accession_numbers') and self.accession_numbers:
+             self.accession_numbers = list(set(self.accession_numbers).intersection(new_accession_numbers))
+         else:
+             # First query, just set the accession numbers
+             self.accession_numbers = new_accession_numbers

-     def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None):
+     def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, **kwargs):
          if provider is None:
              config = Config()
              provider = config.get_default_source()

-         downloader = PremiumDownloader() if provider == 'datamule' else Downloader()
-         downloader.download_submissions(
-             output_dir=self.path,
+         # Process CIK and metadata filters
+         cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+         if provider == 'datamule':
+
+             seclibrary_download(
+                 output_dir=self.path,
+                 cik=cik,
+                 submission_type=submission_type,
+                 filing_date=filing_date,
+                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+             )
+         else:
+             sec_download(
+                 output_dir=self.path,
+                 cik=cik,
+                 submission_type=submission_type,
+                 filing_date=filing_date,
+                 requests_per_second=5, # Revisit this later.
+                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+             )
+
+         self.submissions_loaded = False
+     def monitor_submissions(self,data_callback=None, poll_callback=None, submission_type=None, cik=None,
+                             polling_interval=200, requests_per_second=5, quiet=False, start_date=None, ticker=None, **kwargs):
+
+         cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+
+         monitor(
+             data_callback=data_callback,
+             poll_callback=poll_callback,
              cik=cik,
-             ticker=ticker,
              submission_type=submission_type,
-             filing_date=filing_date
+             polling_interval=polling_interval,
+             requests_per_second=requests_per_second,
+             quiet=quiet,
+             start_date=start_date
          )
-
-         # Reload submissions after download
-         self._load_submissions()

+
+
+
      def __iter__(self):
+         if not self.submissions_loaded:
+             self._load_submissions()
          return iter(self.submissions)

      def document_type(self, document_types):
          """Filter documents by type(s)."""
+         if not self.submissions_loaded:
+             self._load_submissions()
          if isinstance(document_types, str):
              document_types = [document_types]

          for submission in self.submissions:
-             yield from submission.document_type(document_types)
-
-     def contains_string(self, pattern, document_types=None):
-         """Search for pattern in documents, with optional type filter."""
-         def check_document(document):
-             return document if document.contains_string(pattern) else None
-
-         # Get documents, filtered by type if specified
-         documents = list(self.document_type(document_types)) if document_types else [
-             doc for sub in self.submissions for doc in sub
-         ]
-
-         with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
-             results = executor.map(check_document, documents)
-
-         for doc in tqdm(results, total=len(documents), desc=f"Searching for '{pattern}'"):
-             if doc is not None:
-                 yield doc
+             yield from submission.document_type(document_types)
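
For review purposes, a hedged end-to-end sketch of the reworked Portfolio flow, based only on the method signatures added above; the directory name, query strings, dates, and XBRL parameter values are illustrative assumptions rather than documented formats:

    from datamule import Portfolio

    portfolio = Portfolio("filings")   # directory is created if it does not already exist

    # Each filter_* call narrows self.accession_numbers; repeated calls intersect.
    portfolio.filter_text("climate risk", submission_type="10-K",
                          filing_date=("2023-01-01", "2023-12-31"))
    portfolio.filter_xbrl(taxonomy="us-gaap", concept="ResearchAndDevelopmentExpense",
                          unit="USD", period="CY2023", logic=">", value=0)

    # Downloads honor the accumulated accession-number filter; provider falls back
    # to Config's default source when not given.
    portfolio.download_submissions(ticker="MSFT", submission_type="10-K")

    # Submissions are lazily reloaded on first use after a download.
    for submission in portfolio:
        for document in submission.document_type("10-K"):
            pass  # process each matching document here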
datamule/submission.py CHANGED
@@ -11,44 +11,6 @@ class Submission:
          metadata_path = self.path / 'metadata.json'
          with metadata_path.open('r') as f:
              self.metadata = json.load(f)
-
-     def keep(self, document_types):
-         """Keep files of specified document types, delete others
-         Args:
-             document_types: string or list of strings representing document types to keep
-         """
-         # Convert single string to list for consistent handling
-         if isinstance(document_types, str):
-             document_types = [document_types]
-
-         for doc in self.metadata['documents']:
-             filename = doc.get('filename')
-             if filename is None:
-                 continue
-
-             filepath = self.path / filename
-             # Delete if document type isn't in our keep list
-             if doc['type'] not in document_types and filepath.exists():
-                 filepath.unlink()
-
-     def drop(self, document_types):
-         """Delete files of specified document types, keep others
-         Args:
-             document_types: string or list of strings representing document types to drop
-         """
-         # Convert single string to list for consistent handling
-         if isinstance(document_types, str):
-             document_types = [document_types]
-
-         for doc in self.metadata['documents']:
-             filename = doc.get('filename')
-             if filename is None:
-                 continue
-
-             filepath = self.path / filename
-             # Delete if document type is in our drop list
-             if doc['type'] in document_types and filepath.exists():
-                 filepath.unlink()

      def document_type(self, document_type):
          # Convert single document type to list for consistent handling
{datamule-1.0.2.dist-info → datamule-1.0.6.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.0.2
+ Version: 1.0.6
  Summary: Making it easier to use SEC filings.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
@@ -17,11 +17,5 @@ Requires-Dist: pytz
  Requires-Dist: zstandard
  Requires-Dist: doc2dict
  Requires-Dist: secsgml
- Provides-Extra: all
- Requires-Dist: openai; extra == "all"
- Requires-Dist: flask; extra == "all"
- Provides-Extra: mulebot
- Requires-Dist: openai; extra == "mulebot"
- Provides-Extra: mulebot_server
- Requires-Dist: flask; extra == "mulebot-server"
+ Requires-Dist: lxml

datamule-1.0.6.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+ datamule/__init__.py,sha256=0npnB3i2F7YB7etG315oDiCd-eMo-A6MP5LX2gQclHY,914
+ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
+ datamule/document.py,sha256=uohyX7pt_nSHOS1y02fOuwjqYewKD9HgIdBwCtOlKx8,10864
+ datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
+ datamule/portfolio.py,sha256=JmZlTrom_g7FXKXxWp_CiQTyC7p6_cDP08G0kFUja48,6982
+ datamule/submission.py,sha256=JsxYlEz1Ywu6eC32OS15p4p-p8qB6SWd_rXuf2p5UfY,1247
+ datamule-1.0.6.dist-info/METADATA,sha256=n53ZBeKhntC3jX6su9jbKPr9WxSohgOvvLC7sIbYwhk,512
+ datamule-1.0.6.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ datamule-1.0.6.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+ datamule-1.0.6.dist-info/RECORD,,
datamule/book/__init__.py DELETED
File without changes
datamule/book/book.py DELETED
@@ -1,34 +0,0 @@
- # Streams data rather than downloading it.
- # additional functionality such as query by xbrl, and other db
- # also this is basically our experimental rework of portfolio w/o disturbing existing users
- # this is highly experimental and may not work as expected
- # only for datamule source
- # likely new bottleneck will be local parsing() - will be bypassed in future when we have parsed archive
- # wow parsed archive is going to be crazy fast - like every 10k in 1 minute.
-
- # example queries filter by sic = 7372, xbrl query = dei:operatingprofit > 0 in date range 2018-2019
-
- # hmm do we go for sql esq or not.
- # I think we do.
- # i think we remove cik, ticker, sic, etc and just have a query object
- # should be sql esq so users can use it easily w/o learnign new syntax
-
- # WHERE submission_type = '10-K'
- # AND us-gaap:ResearchAndDevelopmentExpense > 0
- # AND dei:debt_to_equity < 2
- # AND filing_date BETWEEN '2023-01-01' AND '2023-12-31'
- # AND CIK in (123, 456, 789)
- # AND SIC in (123, 456, 789)
- # AND ticker in ('AAPL', 'GOOGL', 'AMZN')
- # AND document_type = 'EX-99.1' # to select attachments
-
- from .eftsquery import EFTSQuery
-
-
- class Book():
-     def process_submissions(self,cik,ticker,sic,submission_type,document_type,date,
-                             xbrl_query={},
-                             metadata_callback=None,
-                             document_callback=None,):
-         # grabs data and processes it
-         pass
datamule/book/eftsquery.py DELETED
@@ -1,127 +0,0 @@
- import asyncio
- import aiohttp
- from tqdm import tqdm
- from datetime import datetime
- from urllib.parse import urlencode
- import time
-
- class PreciseRateLimiter:
-     def __init__(self, rate=10, interval=1.0):
-         self.rate = rate  # requests per interval
-         self.interval = interval  # in seconds
-         self.token_time = self.interval / self.rate  # time per token
-         self.last_time = time.time()
-         self.lock = asyncio.Lock()
-
-     async def acquire(self):
-         async with self.lock:
-             now = time.time()
-             wait_time = self.last_time + self.token_time - now
-             if wait_time > 0:
-                 await asyncio.sleep(wait_time)
-             self.last_time = time.time()
-             return True
-
- class EFTSQuery:
-     def __init__(self):
-         self.headers = {
-             'User-Agent': 'Your Name yourname@email.com',
-             'Accept-Encoding': 'gzip, deflate',
-             'Host': 'efts.sec.gov'
-         }
-         self.session = None
-         self.limiter = PreciseRateLimiter(10)
-
-     async def __aenter__(self):
-         if not self.session:
-             self.session = aiohttp.ClientSession(headers=self.headers)
-         return self
-
-     async def __aexit__(self, exc_type, exc_val, exc_tb):
-         if self.session:
-             await self.session.close()
-             self.session = None
-
-     async def _fetch_json(self, url):
-         await self.limiter.acquire()
-         try:
-             async with self.session.get(url) as response:
-                 if response.status == 429:
-                     await asyncio.sleep(61)
-                     return await self._fetch_json(url)
-                 return await response.json()
-         except Exception as e:
-             print(f"Error fetching {url}: {str(e)}")
-             return None
-
-     async def _get_accession_numbers(self, base_url):
-         data = await self._fetch_json(f"{base_url}&from=0&size=1")
-         if not data or 'hits' not in data:
-             return []
-
-         total_hits = data['hits']['total']['value']
-         if not total_hits:
-             return []
-
-         accession_numbers = []
-         start = 0
-         page_size = 100
-         batch_size = 10  # Number of concurrent requests
-
-         with tqdm(total=total_hits) as pbar:
-             while start < total_hits:
-                 tasks = []
-                 for i in range(batch_size):
-                     if start + i * page_size >= total_hits:
-                         break
-                     url = f"{base_url}&from={start + i * page_size}&size={page_size}"
-                     tasks.append(self._fetch_json(url))
-
-                 if not tasks:
-                     break
-
-                 results = await asyncio.gather(*tasks)
-
-                 for data in results:
-                     if data and 'hits' in data:
-                         hits = data['hits']['hits']
-                         batch_numbers = [
-                             f"{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}"
-                             for hit in hits
-                         ]
-                         accession_numbers.extend(batch_numbers)
-                         pbar.update(len(hits))
-
-                 start += batch_size * page_size
-
-         return accession_numbers
-
-     def query_efts(self, cik=None, ticker=None, submission_type=None, filing_date=None, search_text=None):
-         async def _download():
-             async with self as downloader:
-                 params = {}
-
-                 if cik:
-                     params['ciks'] = str(cik).zfill(10)
-
-                 if submission_type:
-                     params['forms'] = ','.join(submission_type) if isinstance(submission_type, list) else submission_type
-
-                 if isinstance(filing_date, list):
-                     dates = [(d, d) for d in filing_date]
-                 elif isinstance(filing_date, tuple):
-                     dates = [filing_date]
-                 else:
-                     date_str = filing_date if filing_date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
-                     start, end = date_str.split(',')
-                     dates = [(start, end)]
-
-                 params['startdt'], params['enddt'] = dates[0]
-
-                 if search_text:
-                     params['q'] = f'"{search_text}"'
-
-                 base_url = f"https://efts.sec.gov/LATEST/search-index?{urlencode(params, doseq=True)}"
-                 return await self._get_accession_numbers(base_url)
-
-         return asyncio.run(_download())
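
For reference, the removed EFTSQuery was driven through its query_efts wrapper shown above; a hedged reconstruction of that pre-1.0.6 usage, with an illustrative CIK, form type, date range, and search phrase (the class no longer ships in 1.0.6):

    from datamule.book.eftsquery import EFTSQuery   # 1.0.2 only; removed in 1.0.6

    query = EFTSQuery()
    accession_numbers = query.query_efts(
        cik=320193,                                  # illustrative CIK, zero-padded internally
        submission_type="10-K",
        filing_date=("2023-01-01", "2023-12-31"),    # tuple form: (start, end)
        search_text="supply chain",
    )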