datamule 0.429__cp39-cp39-macosx_10_9_universal2.whl → 0.430__cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document.py +3 -0
- datamule/downloader/premiumdownloader.py +2 -1
- datamule/parser/document_parsing/helper.py +1 -1
- datamule/parser/sgml_parsing/sgml_parser_cy.cpython-39-darwin.so +0 -0
- datamule/portfolio.py +63 -50
- {datamule-0.429.dist-info → datamule-0.430.dist-info}/METADATA +3 -3
- {datamule-0.429.dist-info → datamule-0.430.dist-info}/RECORD +9 -9
- {datamule-0.429.dist-info → datamule-0.430.dist-info}/WHEEL +1 -1
- {datamule-0.429.dist-info → datamule-0.430.dist-info}/top_level.txt +0 -0
datamule/document.py
CHANGED
@@ -17,6 +17,9 @@ class Document:
|
|
17
17
|
self.data = None
|
18
18
|
self.content = None
|
19
19
|
|
20
|
+
def _load_content(self):
|
21
|
+
self.content = load_file_content(self.path)
|
22
|
+
|
20
23
|
def contains_string(self, pattern):
|
21
24
|
"""Currently only works for .htm, .html, and .txt files"""
|
22
25
|
if self.path.suffix in ['.htm', '.html', '.txt']:
|
@@ -259,7 +259,8 @@ class PremiumDownloader:
|
|
259
259
|
keepalive_timeout=60
|
260
260
|
)
|
261
261
|
|
262
|
-
|
262
|
+
# timeout should be max 2 hours.
|
263
|
+
async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=7200)) as session:
|
263
264
|
tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
|
264
265
|
await asyncio.gather(*tasks, return_exceptions=True)
|
265
266
|
|
@@ -62,7 +62,7 @@ def load_file_content(filename):
|
|
62
62
|
elif filename.suffix in ['.html','.htm']:
|
63
63
|
return load_html_content(filename)
|
64
64
|
else:
|
65
|
-
raise ValueError(f"Unsupported file type: {filename}")
|
65
|
+
raise ValueError(f"Unsupported file type: {filename.suffix}")
|
66
66
|
|
67
67
|
def clean_title(title: str) -> str:
|
68
68
|
"""Clean up section title by removing newlines, periods, and all whitespace, converting to lowercase."""
|
Binary file
|
datamule/portfolio.py
CHANGED
@@ -1,82 +1,95 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
from tqdm import tqdm
|
3
|
-
from concurrent.futures import
|
3
|
+
from concurrent.futures import ThreadPoolExecutor
|
4
4
|
from .submission import Submission
|
5
5
|
from .downloader.premiumdownloader import PremiumDownloader
|
6
6
|
from .downloader.downloader import Downloader
|
7
7
|
from .config import Config
|
8
|
+
import os
|
8
9
|
|
9
10
|
class Portfolio:
|
10
|
-
def
|
11
|
-
# This method handles the process pool lifecycle
|
12
|
-
with ProcessPoolExecutor() as executor:
|
13
|
-
portfolio = cls(path, executor)
|
14
|
-
return portfolio
|
15
|
-
|
16
|
-
def __init__(self, path, executor=None):
|
11
|
+
def __init__(self, path):
|
17
12
|
self.path = Path(path)
|
18
|
-
|
13
|
+
self.submissions = []
|
14
|
+
self.MAX_WORKERS = os.cpu_count() - 1
|
15
|
+
|
19
16
|
if self.path.exists():
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
17
|
+
self._load_submissions()
|
18
|
+
|
19
|
+
def _load_submissions(self):
|
20
|
+
folders = [f for f in self.path.iterdir() if f.is_dir()]
|
21
|
+
print(f"Loading {len(folders)} submissions")
|
22
|
+
|
23
|
+
with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
|
24
|
+
self.submissions = list(tqdm(
|
25
|
+
executor.map(Submission, folders),
|
26
|
+
total=len(folders),
|
27
|
+
desc="Loading submissions"
|
28
|
+
))
|
29
|
+
|
30
|
+
def process_submissions(self, callback):
|
31
|
+
"""Process all submissions using a thread pool."""
|
32
|
+
with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
|
33
|
+
results = list(tqdm(
|
34
|
+
executor.map(callback, self.submissions),
|
35
|
+
total=len(self.submissions),
|
36
|
+
desc="Processing submissions"
|
37
|
+
))
|
38
|
+
return results
|
39
|
+
|
40
|
+
def process_documents(self, callback):
|
41
|
+
"""Process all documents using a thread pool."""
|
42
|
+
documents = [doc for sub in self.submissions for doc in sub]
|
43
|
+
|
44
|
+
with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
|
45
|
+
results = list(tqdm(
|
46
|
+
executor.map(callback, documents),
|
47
|
+
total=len(documents),
|
48
|
+
desc="Processing documents"
|
49
|
+
))
|
50
|
+
return results
|
36
51
|
|
37
52
|
def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None):
|
38
53
|
if provider is None:
|
39
54
|
config = Config()
|
40
55
|
provider = config.get_default_source()
|
41
56
|
|
42
|
-
if provider == '
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
57
|
+
downloader = PremiumDownloader() if provider == 'datamule' else Downloader()
|
58
|
+
downloader.download_submissions(
|
59
|
+
output_dir=self.path,
|
60
|
+
cik=cik,
|
61
|
+
ticker=ticker,
|
62
|
+
submission_type=submission_type,
|
63
|
+
filing_date=filing_date
|
64
|
+
)
|
49
65
|
|
50
66
|
# Reload submissions after download
|
51
|
-
self.
|
52
|
-
|
67
|
+
self._load_submissions()
|
68
|
+
|
53
69
|
def __iter__(self):
|
54
70
|
return iter(self.submissions)
|
55
71
|
|
56
72
|
def document_type(self, document_types):
|
57
|
-
|
73
|
+
"""Filter documents by type(s)."""
|
58
74
|
if isinstance(document_types, str):
|
59
75
|
document_types = [document_types]
|
60
76
|
|
61
77
|
for submission in self.submissions:
|
62
78
|
yield from submission.document_type(document_types)
|
63
79
|
|
64
|
-
def contains_string(self, pattern, document_types=None
|
80
|
+
def contains_string(self, pattern, document_types=None):
|
81
|
+
"""Search for pattern in documents, with optional type filter."""
|
65
82
|
def check_document(document):
|
66
83
|
return document if document.contains_string(pattern) else None
|
67
84
|
|
68
|
-
documents
|
69
|
-
|
70
|
-
|
85
|
+
# Get documents, filtered by type if specified
|
86
|
+
documents = list(self.document_type(document_types)) if document_types else [
|
87
|
+
doc for sub in self.submissions for doc in sub
|
88
|
+
]
|
71
89
|
|
72
|
-
|
73
|
-
results =
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
yield from (doc for doc in results if doc is not None)
|
79
|
-
else:
|
80
|
-
for document in tqdm(documents, desc=f"Searching for '{pattern}'"):
|
81
|
-
if document.contains_string(pattern):
|
82
|
-
yield document
|
90
|
+
with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
|
91
|
+
results = executor.map(check_document, documents)
|
92
|
+
|
93
|
+
for doc in tqdm(results, total=len(documents), desc=f"Searching for '{pattern}'"):
|
94
|
+
if doc is not None:
|
95
|
+
yield doc
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: datamule
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.430
|
4
4
|
Summary: Making it easier to use SEC filings.
|
5
5
|
Home-page: https://github.com/john-friedman/datamule-python
|
6
6
|
Author: John Friedman
|
@@ -24,8 +24,8 @@ Requires-Dist: pandas; extra == "dataset-builder"
|
|
24
24
|
Requires-Dist: google-generativeai; extra == "dataset-builder"
|
25
25
|
Requires-Dist: psutil; extra == "dataset-builder"
|
26
26
|
Provides-Extra: all
|
27
|
-
Requires-Dist: openai; extra == "all"
|
28
|
-
Requires-Dist: psutil; extra == "all"
|
29
27
|
Requires-Dist: pandas; extra == "all"
|
28
|
+
Requires-Dist: psutil; extra == "all"
|
30
29
|
Requires-Dist: google-generativeai; extra == "all"
|
30
|
+
Requires-Dist: openai; extra == "all"
|
31
31
|
Requires-Dist: flask; extra == "all"
|
@@ -1,10 +1,10 @@
|
|
1
1
|
datamule/__init__.py,sha256=qRl4jjMTyekdRV16KOyA4_DoqB-pBi8hMxaUDo88Vu8,2352
|
2
2
|
datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
|
3
|
-
datamule/document.py,sha256=
|
3
|
+
datamule/document.py,sha256=LIUaDUS-mBigCghnecEespNnFwKsVj6pwpZIyusMq-M,5715
|
4
4
|
datamule/helper.py,sha256=tr3AQWus9dHNZFKpLSglWjcb8zmm5qDXjOWACMhvMxQ,4594
|
5
5
|
datamule/monitor.py,sha256=mRaM8v5NgcMF9DJ1s_YBzucjrbr-3yFwW422MVml-_Q,9114
|
6
6
|
datamule/packageupdater.py,sha256=vEGqlTj6FudIeVHBVJltPh2eBDEqMG9HYmnyrRVKeSU,9595
|
7
|
-
datamule/portfolio.py,sha256=
|
7
|
+
datamule/portfolio.py,sha256=CNGhNpk8JwL85CCX7RYheTevFB-eGB0_VK02Bo_t9Mc,3566
|
8
8
|
datamule/submission.py,sha256=j-uIuwHIvRheXo5RcDf1GYVeYbdjq7yfoU1-GKTH2f8,2790
|
9
9
|
datamule/data/company_former_names.csv,sha256=HE9cAv-_QKFX6jT-_-D0rHmaDyQuAzL4MJwank5O1U8,706380
|
10
10
|
datamule/data/company_metadata.csv,sha256=yPovrCVjYwLWTU_hBUFJymp8iNO0NBYuq_QwOkRLoN8,3068599
|
@@ -13,7 +13,7 @@ datamule/data/sec-glossary.csv,sha256=-cN7GjiadLw5C1sv4zSeCnfeZZDYeSgJl-0ydarMAo
|
|
13
13
|
datamule/data/xbrl_descriptions.csv,sha256=SQ9wUURNqG424rnTiZtopsxV2q-PvU4NMj52LqgDsvg,2621524
|
14
14
|
datamule/dataset_builder/dataset_builder.py,sha256=NCvNbDwlEkA_eAbqbsG--YlqPBDREFTVSM1GJquR0RE,9747
|
15
15
|
datamule/downloader/downloader.py,sha256=Cw8FAc0oi8H4okK6pVuS9hkAGoXR72fV3xAdF83J4v4,14558
|
16
|
-
datamule/downloader/premiumdownloader.py,sha256=
|
16
|
+
datamule/downloader/premiumdownloader.py,sha256=WyR0lHHva9txCYzf-OrZ2o-ScPxDBtjAfnB5ebUpzjk,14251
|
17
17
|
datamule/mulebot/__init__.py,sha256=YvZXV6xQ0iP-oGD8rloufjdwJL6D46P3NNr0CY9PQCA,29
|
18
18
|
datamule/mulebot/helper.py,sha256=olztOwltfELZ-IERM2bRNLBavD04kfB6ueWTisJAleA,1080
|
19
19
|
datamule/mulebot/mulebot.py,sha256=XbtgvXBSFu9OaaLW_k1KDgHVTNQGV8_0ZwNMFad-pPU,5837
|
@@ -39,7 +39,7 @@ datamule/parser/document_parsing/basic_13g_parser.py,sha256=sWg83-QTAzUDNs45iWtp
|
|
39
39
|
datamule/parser/document_parsing/basic_8k_parser.py,sha256=inCSmlH_BkLK0Lkvt0kZ6EUJ0nijul_RkdXzccyOmRI,2466
|
40
40
|
datamule/parser/document_parsing/form_d_parser.py,sha256=dWlGeVZRzh0kfT3gVMC8eyqeQORdVV3r8KXUwEqAW3s,2036
|
41
41
|
datamule/parser/document_parsing/generalized_item_parser.py,sha256=67_DFb1BQbMmdHefEgoCPlEoiUT0zyxh3eBNJpjGXUk,2616
|
42
|
-
datamule/parser/document_parsing/helper.py,sha256=
|
42
|
+
datamule/parser/document_parsing/helper.py,sha256=sIddTOeX3yjCXlJeW8_2-10NU9omDJrG9HyYdXisB1I,2540
|
43
43
|
datamule/parser/document_parsing/information_table_parser_13fhr.py,sha256=R4Up1oDx3xAlzHwXzVzUkdOSsk8YPuJBPS_3I_bNQSE,1767
|
44
44
|
datamule/parser/document_parsing/insider_trading_parser.py,sha256=OVQDeLcfaZtgmOWvWPDotftO6jxx-doFAqBYVqNgypo,7106
|
45
45
|
datamule/parser/document_parsing/mappings.py,sha256=VKdnT3C5yPTbB4ZBa4El4jnB-6_osomm2rbJx6Ac6HE,5286
|
@@ -47,8 +47,8 @@ datamule/parser/document_parsing/n_port_p_parser.py,sha256=GmmQFkCZt57WikUZ5Daht
|
|
47
47
|
datamule/parser/document_parsing/sec_parser.py,sha256=AS8H4h1sfUAdWP2gotULcjbylsYN_nHgTfkeVRyENPo,2716
|
48
48
|
datamule/parser/document_parsing/sgml_parser.py,sha256=tC1cL3cdVQPWbc9QtoRUYSo2wRuYNaglFaCmP57oEfA,3317
|
49
49
|
datamule/parser/sgml_parsing/sgml_parser_cy.c,sha256=P36bPkmVEmEwIhxumwJTmPslHqCaFcNhVSKPTQBQAH0,840575
|
50
|
-
datamule/parser/sgml_parsing/sgml_parser_cy.cpython-39-darwin.so,sha256=
|
51
|
-
datamule-0.
|
52
|
-
datamule-0.
|
53
|
-
datamule-0.
|
54
|
-
datamule-0.
|
50
|
+
datamule/parser/sgml_parsing/sgml_parser_cy.cpython-39-darwin.so,sha256=nPo3GthbFQ_ilWHrcthzk1v8NgSQwUpBV7g4-vluVC0,395976
|
51
|
+
datamule-0.430.dist-info/METADATA,sha256=-Mcy4T0s0kL-WumRJrBkGtq7nlQ-XvAuosqCe2zYfeI,1006
|
52
|
+
datamule-0.430.dist-info/WHEEL,sha256=XMF1UNSMnizwUL-B7Kuq9HqI3_zfzDKBOk80WyNIjzA,112
|
53
|
+
datamule-0.430.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
54
|
+
datamule-0.430.dist-info/RECORD,,
|
File without changes
|