datamule 0.428.tar.gz → 1.0.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-0.428 → datamule-1.0.3}/PKG-INFO +1 -2
- datamule-1.0.3/datamule/__init__.py +47 -0
- datamule-1.0.3/datamule/book/__init__.py +0 -0
- datamule-1.0.3/datamule/book/book.py +34 -0
- datamule-1.0.3/datamule/book/eftsquery.py +127 -0
- datamule-1.0.3/datamule/book/xbrl_retriever.py +88 -0
- datamule-1.0.3/datamule/document.py +279 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/downloader/downloader.py +18 -8
- {datamule-0.428 → datamule-1.0.3}/datamule/downloader/premiumdownloader.py +8 -5
- datamule-1.0.3/datamule/mapping_dicts/txt_mapping_dicts.py +234 -0
- datamule-1.0.3/datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/monitor.py +57 -10
- datamule-1.0.3/datamule/portfolio.py +106 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/submission.py +9 -9
- {datamule-0.428 → datamule-1.0.3}/datamule.egg-info/PKG-INFO +1 -2
- {datamule-0.428 → datamule-1.0.3}/datamule.egg-info/SOURCES.txt +7 -17
- {datamule-0.428 → datamule-1.0.3}/datamule.egg-info/requires.txt +2 -8
- datamule-1.0.3/setup.py +47 -0
- datamule-0.428/datamule/__init__.py +0 -74
- datamule-0.428/datamule/dataset_builder/dataset_builder.py +0 -259
- datamule-0.428/datamule/document.py +0 -142
- datamule-0.428/datamule/parser/document_parsing/basic_10k_parser.py +0 -82
- datamule-0.428/datamule/parser/document_parsing/basic_10q_parser.py +0 -73
- datamule-0.428/datamule/parser/document_parsing/basic_13d_parser.py +0 -58
- datamule-0.428/datamule/parser/document_parsing/basic_13g_parser.py +0 -61
- datamule-0.428/datamule/parser/document_parsing/basic_8k_parser.py +0 -84
- datamule-0.428/datamule/parser/document_parsing/form_d_parser.py +0 -70
- datamule-0.428/datamule/parser/document_parsing/generalized_item_parser.py +0 -78
- datamule-0.428/datamule/parser/document_parsing/helper.py +0 -75
- datamule-0.428/datamule/parser/document_parsing/information_table_parser_13fhr.py +0 -41
- datamule-0.428/datamule/parser/document_parsing/insider_trading_parser.py +0 -158
- datamule-0.428/datamule/parser/document_parsing/mappings.py +0 -95
- datamule-0.428/datamule/parser/document_parsing/n_port_p_parser.py +0 -70
- datamule-0.428/datamule/parser/document_parsing/sec_parser.py +0 -73
- datamule-0.428/datamule/parser/document_parsing/sgml_parser.py +0 -94
- datamule-0.428/datamule/parser/sgml_parsing/sgml_parser_cy.c +0 -20006
- datamule-0.428/datamule/portfolio.py +0 -78
- datamule-0.428/setup.py +0 -93
- {datamule-0.428 → datamule-1.0.3}/datamule/config.py +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/data/company_former_names.csv +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/data/company_metadata.csv +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/data/company_tickers.csv +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/data/sec-glossary.csv +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/data/xbrl_descriptions.csv +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/helper.py +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/__init__.py +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/helper.py +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot.py +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/__init__.py +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/server.py +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/main.js +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/search.py +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/mulebot/tools.py +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule/packageupdater.py +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-0.428 → datamule-1.0.3}/datamule.egg-info/top_level.txt +0 -0
- {datamule-0.428 → datamule-1.0.3}/setup.cfg +0 -0
--- datamule-0.428/PKG-INFO
+++ datamule-1.0.3/PKG-INFO
@@ -1,10 +1,9 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 0.428
+Version: 1.0.3
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
 Provides-Extra: mulebot
 Provides-Extra: mulebot_server
-Provides-Extra: dataset_builder
 Provides-Extra: all
--- /dev/null
+++ datamule-1.0.3/datamule/__init__.py
@@ -0,0 +1,47 @@
+from .downloader.downloader import Downloader
+from .downloader.premiumdownloader import PremiumDownloader
+from .monitor import Monitor
+from .packageupdater import PackageUpdater
+from .submission import Submission
+from .portfolio import Portfolio
+from .document import Document
+from secsgml import parse_sgml_submission
+from .helper import load_package_csv, load_package_dataset
+from .config import Config
+
+
+# Keep the notebook environment setup
+def _is_notebook_env():
+    """Check if the code is running in a Jupyter or Colab environment."""
+    try:
+        shell = get_ipython().__class__.__name__
+        return shell in ('ZMQInteractiveShell', 'Shell', 'Google.Colab')
+    except NameError:
+        return False
+
+from functools import lru_cache
+
+@lru_cache(maxsize=1)
+def _setup_notebook_env():
+    """Setup Jupyter/Colab-specific configurations if needed."""
+    if _is_notebook_env():
+        import nest_asyncio
+        nest_asyncio.apply()
+
+# Set up notebook environment
+_setup_notebook_env()
+
+__all__ = [
+    'Downloader',
+    'PremiumDownloader',
+    'load_package_csv',
+    'load_package_dataset',
+    'Filing',
+    'Portfolio',
+    'Monitor',
+    'PackageUpdater',
+    'Submission',
+    'Document',
+    'parse_sgml_submission',
+    'Config'
+]
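For orientation, a minimal sketch of the import surface this new `__init__.py` exposes. Only names re-exported above are used; note that `'Filing'` is listed in `__all__` without a corresponding import, so `from datamule import Filing` would fail in 1.0.3 as released.

```python
# Sketch of the 1.0.3 top-level API surface; nothing is instantiated here.
from datamule import (
    Downloader,
    PremiumDownloader,
    Portfolio,
    Monitor,
    PackageUpdater,
    Submission,
    Document,
    Config,
    parse_sgml_submission,   # now re-exported from the external secsgml package
    load_package_csv,
    load_package_dataset,
)
```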
datamule-1.0.3/datamule/book/__init__.py: file without changes (empty file).
--- /dev/null
+++ datamule-1.0.3/datamule/book/book.py
@@ -0,0 +1,34 @@
+# Streams data rather than downloading it.
+# additional functionality such as query by xbrl, and other db
+# also this is basically our experimental rework of portfolio w/o disturbing existing users
+# this is highly experimental and may not work as expected
+# only for datamule source
+# likely new bottleneck will be local parsing() - will be bypassed in future when we have parsed archive
+# wow parsed archive is going to be crazy fast - like every 10k in 1 minute.
+
+# example queries filter by sic = 7372, xbrl query = dei:operatingprofit > 0 in date range 2018-2019
+
+# hmm do we go for sql esq or not.
+# I think we do.
+# i think we remove cik, ticker, sic, etc and just have a query object
+# should be sql esq so users can use it easily w/o learnign new syntax
+
+# WHERE submission_type = '10-K'
+# AND us-gaap:ResearchAndDevelopmentExpense > 0
+# AND dei:debt_to_equity < 2
+# AND filing_date BETWEEN '2023-01-01' AND '2023-12-31'
+# AND CIK in (123, 456, 789)
+# AND SIC in (123, 456, 789)
+# AND ticker in ('AAPL', 'GOOGL', 'AMZN')
+# AND document_type = 'EX-99.1' # to select attachments
+
+from .eftsquery import EFTSQuery
+
+
+class Book():
+    def process_submissions(self,cik,ticker,sic,submission_type,document_type,date,
+                            xbrl_query={},
+                            metadata_callback=None,
+                            document_callback=None,):
+        # grabs data and processes it
+        pass
--- /dev/null
+++ datamule-1.0.3/datamule/book/eftsquery.py
@@ -0,0 +1,127 @@
+import asyncio
+import aiohttp
+from tqdm import tqdm
+from datetime import datetime
+from urllib.parse import urlencode
+import time
+
+class PreciseRateLimiter:
+    def __init__(self, rate=10, interval=1.0):
+        self.rate = rate  # requests per interval
+        self.interval = interval  # in seconds
+        self.token_time = self.interval / self.rate  # time per token
+        self.last_time = time.time()
+        self.lock = asyncio.Lock()
+
+    async def acquire(self):
+        async with self.lock:
+            now = time.time()
+            wait_time = self.last_time + self.token_time - now
+            if wait_time > 0:
+                await asyncio.sleep(wait_time)
+            self.last_time = time.time()
+            return True
+
+class EFTSQuery:
+    def __init__(self):
+        self.headers = {
+            'User-Agent': 'Your Name yourname@email.com',
+            'Accept-Encoding': 'gzip, deflate',
+            'Host': 'efts.sec.gov'
+        }
+        self.session = None
+        self.limiter = PreciseRateLimiter(10)
+
+    async def __aenter__(self):
+        if not self.session:
+            self.session = aiohttp.ClientSession(headers=self.headers)
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if self.session:
+            await self.session.close()
+            self.session = None
+
+    async def _fetch_json(self, url):
+        await self.limiter.acquire()
+        try:
+            async with self.session.get(url) as response:
+                if response.status == 429:
+                    await asyncio.sleep(61)
+                    return await self._fetch_json(url)
+                return await response.json()
+        except Exception as e:
+            print(f"Error fetching {url}: {str(e)}")
+            return None
+
+    async def _get_accession_numbers(self, base_url):
+        data = await self._fetch_json(f"{base_url}&from=0&size=1")
+        if not data or 'hits' not in data:
+            return []
+
+        total_hits = data['hits']['total']['value']
+        if not total_hits:
+            return []
+
+        accession_numbers = []
+        start = 0
+        page_size = 100
+        batch_size = 10  # Number of concurrent requests
+
+        with tqdm(total=total_hits) as pbar:
+            while start < total_hits:
+                tasks = []
+                for i in range(batch_size):
+                    if start + i * page_size >= total_hits:
+                        break
+                    url = f"{base_url}&from={start + i * page_size}&size={page_size}"
+                    tasks.append(self._fetch_json(url))
+
+                if not tasks:
+                    break
+
+                results = await asyncio.gather(*tasks)
+
+                for data in results:
+                    if data and 'hits' in data:
+                        hits = data['hits']['hits']
+                        batch_numbers = [
+                            f"{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}"
+                            for hit in hits
+                        ]
+                        accession_numbers.extend(batch_numbers)
+                        pbar.update(len(hits))
+
+                start += batch_size * page_size
+
+        return accession_numbers
+
+    def query_efts(self, cik=None, ticker=None, submission_type=None, filing_date=None, search_text=None):
+        async def _download():
+            async with self as downloader:
+                params = {}
+
+                if cik:
+                    params['ciks'] = str(cik).zfill(10)
+
+                if submission_type:
+                    params['forms'] = ','.join(submission_type) if isinstance(submission_type, list) else submission_type
+
+                if isinstance(filing_date, list):
+                    dates = [(d, d) for d in filing_date]
+                elif isinstance(filing_date, tuple):
+                    dates = [filing_date]
+                else:
+                    date_str = filing_date if filing_date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
+                    start, end = date_str.split(',')
+                    dates = [(start, end)]
+
+                params['startdt'], params['enddt'] = dates[0]
+
+                if search_text:
+                    params['q'] = f'"{search_text}"'
+
+                base_url = f"https://efts.sec.gov/LATEST/search-index?{urlencode(params, doseq=True)}"
+                return await self._get_accession_numbers(base_url)
+
+        return asyncio.run(_download())
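A hedged usage sketch of `EFTSQuery` as written above. The hard-coded `User-Agent` placeholder should be replaced with real contact details before querying SEC endpoints, the `ticker` argument is accepted but not yet used when building the query, and the CIK and dates below are illustrative. `PreciseRateLimiter` spaces calls one token apart (0.1 s at `rate=10`), so large result sets page through at roughly ten requests per second.

```python
from datamule.book.eftsquery import EFTSQuery

query = EFTSQuery()
# Returns "cik/accession" identifiers for matching filings.
ids = query.query_efts(
    cik=320193,                               # illustrative CIK; zero-padded internally
    submission_type='10-K',
    filing_date=('2023-01-01', '2023-12-31'),
)
print(len(ids), ids[:3])
```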
--- /dev/null
+++ datamule-1.0.3/datamule/book/xbrl_retriever.py
@@ -0,0 +1,88 @@
+import asyncio
+import aiohttp
+import time
+
+class PreciseRateLimiter:
+    def __init__(self, rate=10, interval=1.0):
+        self.rate = rate
+        self.interval = interval
+        self.token_time = self.interval / self.rate
+        self.last_time = time.time()
+        self.lock = asyncio.Lock()
+
+    async def acquire(self):
+        async with self.lock:
+            now = time.time()
+            wait_time = self.last_time + self.token_time - now
+            if wait_time > 0:
+                await asyncio.sleep(wait_time)
+            self.last_time = time.time()
+            return True
+
+class XBRLRetriever:
+    def __init__(self):
+        self.base_url = "https://data.sec.gov/api/xbrl/frames"
+        self.headers = {
+            'User-Agent': 'Your Name yourname@email.com',
+            'Accept-Encoding': 'gzip, deflate',
+            'Host': 'data.sec.gov'
+        }
+        self.session = None
+        self.limiter = PreciseRateLimiter(10)
+
+    async def __aenter__(self):
+        if not self.session:
+            self.session = aiohttp.ClientSession(headers=self.headers)
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if self.session:
+            await self.session.close()
+            self.session = None
+
+    async def _fetch_json(self, url):
+        await self.limiter.acquire()
+        try:
+            async with self.session.get(url) as response:
+                if response.status == 429:
+                    await asyncio.sleep(61)
+                    return await self._fetch_json(url)
+                elif response.status == 200:
+                    return await response.json()
+                else:
+                    print(f"Error {response.status} for URL: {url}")
+                    return None
+        except Exception as e:
+            print(f"Error fetching {url}: {str(e)}")
+            return None
+
+    def _build_url(self, params):
+        taxonomy = params.get('taxonomy')
+        concept = params.get('concept')
+        unit = params.get('unit')
+        period = params.get('period')
+
+        if not all([taxonomy, concept, unit, period]):
+            raise ValueError("Missing required parameters")
+
+        return f"{self.base_url}/{taxonomy}/{concept}/{unit}/{period}.json"
+
+    async def _get_xbrl_data(self, params_list):
+        tasks = []
+        urls = {}
+
+        for params in params_list:
+            url = self._build_url(params)
+            urls[url] = params
+            tasks.append(self._fetch_json(url))
+
+        results = await asyncio.gather(*tasks)
+
+        return {url: result for url, result in zip(urls.keys(), results) if result is not None}
+
+    def get_xbrl_frames(self, params_list):
+        async def _download():
+            async with self as downloader:
+                return await self._get_xbrl_data(params_list)
+
+        return asyncio.run(_download())
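A hedged sketch of `XBRLRetriever.get_xbrl_frames`. Each entry in `params_list` must supply the four keys `_build_url` checks for (`taxonomy`, `concept`, `unit`, `period`); the concrete values below follow the SEC XBRL frames URL format but are illustrative, not taken from this diff.

```python
from datamule.book.xbrl_retriever import XBRLRetriever

retriever = XBRLRetriever()
params_list = [
    # Illustrative frame: US-GAAP Assets reported in USD, instantaneous frame for Q1 2023.
    {'taxonomy': 'us-gaap', 'concept': 'Assets', 'unit': 'USD', 'period': 'CY2023Q1I'},
]
frames = retriever.get_xbrl_frames(params_list)   # maps each frames URL to its JSON response
for url, payload in frames.items():
    print(url, len(payload.get('data', [])))
```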
--- /dev/null
+++ datamule-1.0.3/datamule/document.py
@@ -0,0 +1,279 @@
+import json
+import csv
+from .helper import convert_to_dashed_accession
+import re
+from doc2dict import xml2dict, txt2dict, dict2dict
+from doc2dict.mapping import flatten_hierarchy
+from .mapping_dicts import txt_mapping_dicts
+from .mapping_dicts import xml_mapping_dicts
+from selectolax.parser import HTMLParser
+
+class Document:
+    def __init__(self, type, filename):
+        self.type = type
+        self.path = filename
+
+        self.data = None
+        self.content = None
+
+
+    def load_content(self,encoding='utf-8'):
+        with open(self.path, 'r',encoding=encoding) as f:
+            self.content = f.read()
+
+    def _load_text_content(self):
+        with open(self.path) as f:
+            return f.read().translate(str.maketrans({
+                '\xa0': ' ', '\u2003': ' ',
+                '\u2018': "'", '\u2019': "'",
+                '\u201c': '"', '\u201d': '"'
+            }))
+
+    # will deprecate this when we add html2dict
+    def _load_html_content(self):
+        with open(self.path,'rb') as f:
+            parser = HTMLParser(f.read(),detect_encoding=True,decode_errors='ignore')
+
+        # Remove hidden elements first
+        hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
+        for node in hidden_nodes:
+            node.decompose()
+
+        blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
+        lines = []
+        current_line = []
+
+        def flush_line():
+            if current_line:
+                # Don't add spaces between adjacent spans
+                lines.append(''.join(current_line))
+                current_line.clear()
+
+        for node in parser.root.traverse(include_text=True):
+            if node.tag in ('script', 'style', 'css'):
+                continue
+
+            if node.tag in blocks:
+                flush_line()
+                lines.append('')
+
+            if node.text_content:
+                text = node.text_content.strip()
+                if text:
+                    if node.tag in blocks:
+                        flush_line()
+                        lines.append(text)
+                        lines.append('')
+                    else:
+                        # Only add space if nodes aren't directly adjacent
+                        if current_line and not current_line[-1].endswith(' '):
+                            if node.prev and node.prev.text_content:
+                                if node.parent != node.prev.parent or node.prev.next != node:
+                                    current_line.append(' ')
+                        current_line.append(text)
+
+        flush_line()
+
+        text = '\n'.join(lines)
+        while '\n\n\n' in text:
+            text = text.replace('\n\n\n', '\n\n')
+
+        return text.translate(str.maketrans({
+            '\xa0': ' ', '\u2003': ' ',
+            '\u2018': "'", '\u2019': "'",
+            '\u201c': '"', '\u201d': '"'
+        }))
+
+    def _load_file_content(self):
+        if self.path.suffix =='.txt':
+            self.content = self._load_text_content()
+        elif self.path.suffix in ['.html','.htm']:
+            self.content = self._load_html_content()
+        else:
+            raise ValueError(f"Unsupported file type: {self.path.suffix}")
+
+
+    def contains_string(self, pattern):
+        """Currently only works for .htm, .html, and .txt files"""
+        if self.path.suffix in ['.htm', '.html', '.txt']:
+            if self.content is None:
+                self.content = self._load_file_content(self.path)
+            return bool(re.search(pattern, self.content))
+        return False
+
+    # Note: this method will be heavily modified in the future
+    def parse(self):
+        mapping_dict = None
+
+        if self.path.suffix == '.xml':
+            if self.type in ['3', '4', '5']:
+                mapping_dict = xml_mapping_dicts.dict_345
+
+            self.load_content()
+            self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+        # will deprecate this when we add html2dict
+        elif self.path.suffix in ['.htm', '.html','.txt']:
+            self._load_file_content()
+
+            if self.type == '10-K':
+                mapping_dict = txt_mapping_dicts.dict_10k
+            elif self.type == '10-Q':
+                mapping_dict = txt_mapping_dicts.dict_10q
+            elif self.type == '8-K':
+                mapping_dict = txt_mapping_dicts.dict_8k
+            elif self.type == 'SC 13D':
+                mapping_dict = txt_mapping_dicts.dict_13d
+            elif self.type == 'SC 13G':
+                mapping_dict = txt_mapping_dicts.dict_13g
+
+            self.data = {}
+            self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
+        return self.data
+
+    def write_json(self, output_filename=None):
+        if not self.data:
+            self.parse()
+
+        if output_filename is None:
+            output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
+
+        with open(output_filename, 'w',encoding='utf-8') as f:
+            json.dump(self.data, f, indent=2)
+
+    def write_csv(self, output_filename=None, accession_number=None):
+        self.parse()
+
+        if output_filename is None:
+            output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
+
+        with open(output_filename, 'w', newline='') as csvfile:
+            if not self.data:
+                return output_filename
+
+            has_document = any('document' in item for item in self.data)
+
+            if has_document and 'document' in self.data:
+                writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
+                writer.writeheader()
+                flattened = self._flatten_dict(self.data['document'])
+                for section, text in flattened.items():
+                    writer.writerow({'section': section, 'text': text})
+            else:
+                fieldnames = list(self.data[0].keys())
+                if accession_number:
+                    fieldnames.append('Accession Number')
+                writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
+                writer.writeheader()
+                for row in self.data:
+                    if accession_number:
+                        row['Accession Number'] = convert_to_dashed_accession(accession_number)
+                    writer.writerow(row)
+
+        return output_filename
+
+    def _document_to_section_text(self, document_data, parent_key=''):
+        items = []
+
+        if isinstance(document_data, dict):
+            for key, value in document_data.items():
+                # Build the section name
+                section = f"{parent_key}_{key}" if parent_key else key
+
+                # If the value is a dict, recurse
+                if isinstance(value, dict):
+                    items.extend(self._document_to_section_text(value, section))
+                # If it's a list, handle each item
+                elif isinstance(value, list):
+                    for i, item in enumerate(value):
+                        if isinstance(item, dict):
+                            items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
+                        else:
+                            items.append({
+                                'section': f"{section}_{i+1}",
+                                'text': str(item)
+                            })
+                # Base case - add the item
+                else:
+                    items.append({
+                        'section': section,
+                        'text': str(value)
+                    })
+
+        return items
+
+    # we'll modify this for every dict
+    def _flatten_dict(self, d, parent_key=''):
+        items = {}
+
+        if isinstance(d, list):
+            return [self._flatten_dict(item) for item in d]
+
+        for k, v in d.items():
+            new_key = f"{parent_key}_{k}" if parent_key else k
+
+            if isinstance(v, dict):
+                items.update(self._flatten_dict(v, new_key))
+            else:
+                items[new_key] = str(v)
+
+        return items
+
+    # this will all have to be changed. default will be to flatten everything
+    def __iter__(self):
+        if not self.data:
+            self.parse()
+
+        # Let's remove XML iterable for now
+
+        # Handle text-based documents
+        if self.path.suffix in ['.txt', '.htm', '.html']:
+            document_data = self.data
+            if not document_data:
+                return iter([])
+
+            # Find highest hierarchy level from mapping dict
+            highest_hierarchy = float('inf')
+            section_type = None
+
+            if self.type in ['10-K', '10-Q']:
+                mapping_dict = txt_mapping_dicts.dict_10k if self.type == '10-K' else txt_mapping_dicts.dict_10q
+            elif self.type == '8-K':
+                mapping_dict = txt_mapping_dicts.dict_8k
+            elif self.type == 'SC 13D':
+                mapping_dict = txt_mapping_dicts.dict_13d
+            elif self.type == 'SC 13G':
+                mapping_dict = txt_mapping_dicts.dict_13g
+            else:
+                return iter([])
+
+            # Find section type with highest hierarchy number
+            highest_hierarchy = -1  # Start at -1 to find highest
+            for mapping in mapping_dict['rules']['mappings']:
+                if mapping.get('hierarchy') is not None:
+                    if mapping['hierarchy'] > highest_hierarchy:
+                        highest_hierarchy = mapping['hierarchy']
+                        section_type = mapping['name']
+
+            if not section_type:
+                return iter([])
+
+            # Extract sections of the identified type
+            def find_sections(data, target_type):
+                sections = []
+                if isinstance(data, dict):
+                    if data.get('type') == target_type:
+                        sections.append({
+                            'item': data.get('text', ''),
+                            'text': flatten_hierarchy(data.get('content', []))
+                        })
+                    for value in data.values():
+                        if isinstance(value, (dict, list)):
+                            sections.extend(find_sections(value, target_type))
+                elif isinstance(data, list):
+                    for item in data:
+                        sections.extend(find_sections(item, target_type))
+                return sections
+
+            return iter(find_sections(document_data, section_type))
+
+        return iter([])
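A hedged sketch of the new `Document` class. `parse()` and `__iter__` rely on `self.path.suffix`, so the example assumes a `pathlib.Path` is passed as `filename`; note that `write_json`/`write_csv` call `self.path.rsplit(...)` (a `str` method) and `contains_string` calls `_load_file_content(self.path)` even though that method takes no argument, so those paths look like latent bugs in 1.0.3. The file name below is illustrative.

```python
from pathlib import Path
from datamule import Document

# Illustrative local file; the '10-K' type selects txt_mapping_dicts.dict_10k in parse().
doc = Document(type='10-K', filename=Path('filings/aapl-20230930.htm'))
data = doc.parse()            # {'document': {...}} for htm/html/txt inputs

# Iteration walks the parsed tree and yields the highest-hierarchy sections
# as {'item': ..., 'text': ...} dicts.
for section in doc:
    print(section['item'])
```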
--- datamule-0.428/datamule/downloader/downloader.py
+++ datamule-1.0.3/datamule/downloader/downloader.py
@@ -10,7 +10,7 @@ import time
 from collections import deque
 
 from ..helper import identifier_to_cik, load_package_csv, fix_filing_url, headers
-from
+from secsgml import parse_sgml_submission
 
 class RetryException(Exception):
     def __init__(self, url, retry_after=601):
@@ -122,8 +122,8 @@ class Downloader:
                 raise RetryException(url)
             raise
 
-    async def _get_filing_urls_from_efts(self, base_url):
-        """Fetch filing URLs from EFTS in batches."""
+    async def _get_filing_urls_from_efts(self, base_url, submission_type=None):
+        """Fetch filing URLs from EFTS in batches with form type filtering."""
         start = 0
         page_size = 100
         urls = []
@@ -152,12 +152,22 @@ class Downloader:
                 if data and 'hits' in data:
                     hits = data['hits']['hits']
                     if hits:
+                        # Filter hits based on exact form match
+                        if not submission_type or submission_type == "-0":
+                            filtered_hits = hits
+                        else:
+                            requested_forms = [submission_type] if isinstance(submission_type, str) else submission_type
+                            filtered_hits = [
+                                hit for hit in hits
+                                if hit['_source'].get('form', '') in requested_forms
+                            ]
+
                         batch_urls = [
                             f"https://www.sec.gov/Archives/edgar/data/{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}.txt"
-                            for hit in
+                            for hit in filtered_hits
                         ]
                         urls.extend(batch_urls)
-                        pbar.update(len(hits))
+                        pbar.update(len(hits))  # Update progress based on total hits processed
                         self.update_progress_description()
 
             start += 10 * page_size
@@ -173,7 +183,7 @@ class Downloader:
        pbar.close()
        self.current_pbar = None
        return urls
-
+
    async def _download_file(self, url, filepath):
        """Download single file with precise rate limiting."""
        async with self.connection_semaphore:
@@ -197,7 +207,7 @@ class Downloader:
 
             parsed_data = parse_sgml_submission(
                 content=content.decode(),
-                output_dir=os.path.dirname(filepath)
+                output_dir=os.path.dirname(filepath)
             )
 
             try:
@@ -306,7 +316,7 @@ class Downloader:
             base_url = "https://efts.sec.gov/LATEST/search-index"
             efts_url = f"{base_url}?{urlencode(params, doseq=True)}"
 
-            urls = await self._get_filing_urls_from_efts(efts_url)
+            urls = await self._get_filing_urls_from_efts(efts_url,submission_type)
             if urls:
                 filepaths, parsed_data = await self._download_and_process(urls, output_dir)
                 all_filepaths.extend(filepaths)
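A key behavioral change to `downloader.py` in this section is the exact-form filter added to `_get_filing_urls_from_efts`: EFTS results may include form variants that do not exactly match the requested type, and 1.0.3 keeps only hits whose `form` field matches the requested submission type(s) before building archive URLs. A standalone sketch of that step, using hypothetical hit payloads with only the fields the filter reads:

```python
# Hypothetical EFTS-style hits; only the '_id' and '_source' fields used by the filter are shown.
hits = [
    {'_id': '0000320193-23-000106:aapl-20230930.htm',
     '_source': {'ciks': ['320193'], 'form': '10-K'}},
    {'_id': '0000320193-23-000120:aapl-20230930-a.htm',
     '_source': {'ciks': ['320193'], 'form': '10-K/A'}},
]

submission_type = '10-K'
requested_forms = [submission_type] if isinstance(submission_type, str) else submission_type

# Mirrors the filtering added in 1.0.3: keep exact form matches only.
filtered_hits = [hit for hit in hits if hit['_source'].get('form', '') in requested_forms]

urls = [
    f"https://www.sec.gov/Archives/edgar/data/{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}.txt"
    for hit in filtered_hits
]
print(urls)   # only the exact 10-K; the 10-K/A variant is dropped
```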