datamule 0.427__cp313-cp313-win_amd64.whl → 0.428__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/__init__.py +3 -0
- datamule/config.py +29 -0
- datamule/document.py +16 -4
- datamule/parser/sgml_parsing/sgml_parser_cy.cp313-win_amd64.pyd +0 -0
- datamule/portfolio.py +61 -14
- datamule/submission.py +10 -1
- {datamule-0.427.dist-info → datamule-0.428.dist-info}/METADATA +3 -3
- {datamule-0.427.dist-info → datamule-0.428.dist-info}/RECORD +10 -9
- {datamule-0.427.dist-info → datamule-0.428.dist-info}/WHEEL +0 -0
- {datamule-0.427.dist-info → datamule-0.428.dist-info}/top_level.txt +0 -0
datamule/__init__.py
CHANGED
@@ -39,6 +39,9 @@ def __getattr__(name):
|
|
39
39
|
elif name == 'load_package_dataset':
|
40
40
|
from .helper import load_package_dataset
|
41
41
|
return load_package_dataset
|
42
|
+
elif name == 'Config':
|
43
|
+
from .config import Config
|
44
|
+
return Config
|
42
45
|
raise AttributeError(f"module 'datamule' has no attribute '{name}'")
|
43
46
|
|
44
47
|
# Lazy load nest_asyncio only when needed
|
datamule/config.py
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
import json
|
2
|
+
import os
|
3
|
+
|
4
|
+
class Config:
|
5
|
+
def __init__(self):
|
6
|
+
self.config_path = os.path.expanduser("~/.datamule/config.json")
|
7
|
+
self._ensure_config_exists()
|
8
|
+
|
9
|
+
def _ensure_config_exists(self):
|
10
|
+
os.makedirs(os.path.dirname(self.config_path), exist_ok=True)
|
11
|
+
if not os.path.exists(self.config_path):
|
12
|
+
self._save_config({"default_source": None})
|
13
|
+
|
14
|
+
def _save_config(self, config):
|
15
|
+
with open(self.config_path, 'w') as f:
|
16
|
+
json.dump(config, f)
|
17
|
+
|
18
|
+
def set_default_source(self, source):
|
19
|
+
config = self._load_config()
|
20
|
+
config["default_source"] = source
|
21
|
+
self._save_config(config)
|
22
|
+
|
23
|
+
def get_default_source(self):
|
24
|
+
config = self._load_config()
|
25
|
+
return config.get("default_source")
|
26
|
+
|
27
|
+
def _load_config(self):
|
28
|
+
with open(self.config_path) as f:
|
29
|
+
return json.load(f)
|
datamule/document.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
import json
|
2
2
|
import csv
|
3
3
|
from .parser.document_parsing.sec_parser import Parser
|
4
|
+
from .parser.document_parsing.helper import load_file_content
|
4
5
|
from .helper import convert_to_dashed_accession
|
6
|
+
import re
|
5
7
|
|
6
8
|
# we need to modify parse filing to take option in memory
|
7
9
|
|
@@ -10,12 +12,22 @@ parser = Parser()
|
|
10
12
|
class Document:
|
11
13
|
def __init__(self, type, filename):
|
12
14
|
self.type = type
|
13
|
-
self.
|
15
|
+
self.path = filename
|
14
16
|
|
15
17
|
self.data = None
|
18
|
+
self.content = None
|
16
19
|
|
20
|
+
def contains_string(self, pattern):
|
21
|
+
"""Currently only works for .htm, .html, and .txt files"""
|
22
|
+
if self.path.suffix in ['.htm', '.html', '.txt']:
|
23
|
+
if self.content is None:
|
24
|
+
self.content = load_file_content(self.path)
|
25
|
+
return bool(re.search(pattern, self.content))
|
26
|
+
return False
|
27
|
+
|
28
|
+
# Note: this method will be heavily modified in the future
|
17
29
|
def parse(self):
|
18
|
-
self.data = parser.parse_filing(self.
|
30
|
+
self.data = parser.parse_filing(self.path, self.type)
|
19
31
|
return self.data
|
20
32
|
|
21
33
|
def write_json(self, output_filename=None):
|
@@ -23,7 +35,7 @@ class Document:
|
|
23
35
|
raise ValueError("No data to write. Parse filing first.")
|
24
36
|
|
25
37
|
if output_filename is None:
|
26
|
-
output_filename = f"{self.
|
38
|
+
output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
|
27
39
|
|
28
40
|
with open(output_filename, 'w') as f:
|
29
41
|
json.dump(self.data, f, indent=2)
|
@@ -33,7 +45,7 @@ class Document:
|
|
33
45
|
raise ValueError("No data available. Please call parse_filing() first.")
|
34
46
|
|
35
47
|
if output_filename is None:
|
36
|
-
output_filename = f"{self.
|
48
|
+
output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
|
37
49
|
|
38
50
|
with open(output_filename, 'w', newline='') as csvfile:
|
39
51
|
if not self.data:
|
Binary file
|
datamule/portfolio.py
CHANGED
@@ -2,9 +2,11 @@ from pathlib import Path
|
|
2
2
|
from tqdm import tqdm
|
3
3
|
from concurrent.futures import ProcessPoolExecutor
|
4
4
|
from .submission import Submission
|
5
|
+
from .downloader.premiumdownloader import PremiumDownloader
|
6
|
+
from .downloader.downloader import Downloader
|
7
|
+
from .config import Config
|
5
8
|
|
6
9
|
class Portfolio:
|
7
|
-
@classmethod
|
8
10
|
def create(cls, path):
|
9
11
|
# This method handles the process pool lifecycle
|
10
12
|
with ProcessPoolExecutor() as executor:
|
@@ -13,19 +15,64 @@ class Portfolio:
|
|
13
15
|
|
14
16
|
def __init__(self, path, executor=None):
|
15
17
|
self.path = Path(path)
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
18
|
+
# check if path exists
|
19
|
+
if self.path.exists():
|
20
|
+
folders = [f for f in self.path.iterdir() if f.is_dir()]
|
21
|
+
print(f"Loading {len(folders)} submissions")
|
22
|
+
|
23
|
+
if executor is None:
|
24
|
+
# Fall back to sequential loading if no executor
|
25
|
+
self.submissions = [Submission(f) for f in tqdm(folders, desc="Loading submissions")]
|
26
|
+
else:
|
27
|
+
# Use provided executor for parallel loading
|
28
|
+
self.submissions = list(tqdm(
|
29
|
+
executor.map(Submission, folders),
|
30
|
+
total=len(folders),
|
31
|
+
desc="Loading submissions"
|
32
|
+
))
|
33
|
+
|
22
34
|
else:
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
)
|
35
|
+
pass
|
36
|
+
|
37
|
+
def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None):
|
38
|
+
if provider is None:
|
39
|
+
config = Config()
|
40
|
+
provider = config.get_default_source()
|
29
41
|
|
42
|
+
if provider == 'sec':
|
43
|
+
downloader = Downloader()
|
44
|
+
elif provider == 'datamule':
|
45
|
+
downloader = PremiumDownloader()
|
46
|
+
|
47
|
+
downloader.download_submissions(output_dir=self.path, cik=cik, ticker=ticker, submission_type=submission_type, filing_date=filing_date
|
48
|
+
)
|
30
49
|
def __iter__(self):
|
31
|
-
return iter(self.submissions)
|
50
|
+
return iter(self.submissions)
|
51
|
+
|
52
|
+
def document_type(self, document_types):
|
53
|
+
# Convert single document type to list for consistent handling
|
54
|
+
if isinstance(document_types, str):
|
55
|
+
document_types = [document_types]
|
56
|
+
|
57
|
+
for submission in self.submissions:
|
58
|
+
yield from submission.document_type(document_types)
|
59
|
+
|
60
|
+
def contains_string(self, pattern, document_types=None, executor=None):
|
61
|
+
def check_document(document):
|
62
|
+
return document if document.contains_string(pattern) else None
|
63
|
+
|
64
|
+
documents = list(self.document_type(document_types) if document_types else (
|
65
|
+
doc for sub in tqdm(self.submissions, desc="Collecting documents") for doc in sub
|
66
|
+
))
|
67
|
+
|
68
|
+
if executor:
|
69
|
+
results = list(tqdm(
|
70
|
+
executor.map(check_document, documents),
|
71
|
+
total=len(documents),
|
72
|
+
desc=f"Searching for '{pattern}'"
|
73
|
+
))
|
74
|
+
yield from (doc for doc in results if doc is not None)
|
75
|
+
else:
|
76
|
+
for document in tqdm(documents, desc=f"Searching for '{pattern}'"):
|
77
|
+
if document.contains_string(pattern):
|
78
|
+
yield document
|
datamule/submission.py
CHANGED
@@ -64,4 +64,13 @@ class Submission:
|
|
64
64
|
continue
|
65
65
|
|
66
66
|
document_path = self.path / filename
|
67
|
-
yield Document(doc['TYPE'], document_path)
|
67
|
+
yield Document(doc['TYPE'], document_path)
|
68
|
+
|
69
|
+
def __iter__(self):
|
70
|
+
for doc in self.metadata['documents']:
|
71
|
+
filename = doc.get('FILENAME')
|
72
|
+
if filename is None:
|
73
|
+
continue
|
74
|
+
|
75
|
+
document_path = self.path / filename
|
76
|
+
yield Document(doc['TYPE'], document_path)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: datamule
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.428
|
4
4
|
Summary: Making it easier to use SEC filings.
|
5
5
|
Home-page: https://github.com/john-friedman/datamule-python
|
6
6
|
Author: John Friedman
|
@@ -24,8 +24,8 @@ Requires-Dist: pandas; extra == "dataset-builder"
|
|
24
24
|
Requires-Dist: google-generativeai; extra == "dataset-builder"
|
25
25
|
Requires-Dist: psutil; extra == "dataset-builder"
|
26
26
|
Provides-Extra: all
|
27
|
+
Requires-Dist: flask; extra == "all"
|
27
28
|
Requires-Dist: google-generativeai; extra == "all"
|
29
|
+
Requires-Dist: pandas; extra == "all"
|
28
30
|
Requires-Dist: psutil; extra == "all"
|
29
|
-
Requires-Dist: flask; extra == "all"
|
30
31
|
Requires-Dist: openai; extra == "all"
|
31
|
-
Requires-Dist: pandas; extra == "all"
|
@@ -1,10 +1,11 @@
|
|
1
|
-
datamule/__init__.py,sha256=
|
2
|
-
datamule/
|
1
|
+
datamule/__init__.py,sha256=ghCMkcNrtQ2dYz9ulnlZ0JSe-aJF0YSVa3da2g0eIWk,2425
|
2
|
+
datamule/config.py,sha256=lrXwhbWFAF3eTa6B4OQgvexSYvaFa-EkWofpLn6AKZM,911
|
3
|
+
datamule/document.py,sha256=NoJS6Q9f7Z2bZzKkaoxooYHdDwqs-TCw_BkVH3krtIU,5774
|
3
4
|
datamule/helper.py,sha256=8HOjB3Y7svw_zjEY-AY5JKOJ-LrBiuQMPyok3MH6CCg,4716
|
4
5
|
datamule/monitor.py,sha256=WVds1HGV_ojYgWmo0b4Dsiv9mzZ85HHnCucH-7XoUw8,9350
|
5
6
|
datamule/packageupdater.py,sha256=X73XlXs9bYSPiwtceaSOEq6wSCqKoG8lyhNyuha6aG0,9801
|
6
|
-
datamule/portfolio.py,sha256=
|
7
|
-
datamule/submission.py,sha256=
|
7
|
+
datamule/portfolio.py,sha256=gKnudwDS-_Qtm0j9YtfGg2mhk_-TVg2h7Ax-faIYqsY,3193
|
8
|
+
datamule/submission.py,sha256=cfX7fvHQBObff0N1jzikCGTuAUE1bGIyqenLRxch9eg,2865
|
8
9
|
datamule/data/company_former_names.csv,sha256=zTBWdV12_JE3aROFOMrFNTHLPW_M4TDruxtl15-XfA0,714528
|
9
10
|
datamule/data/company_metadata.csv,sha256=X7uSIwConqC0sz-moIhXIISg6FI7GLGSlvAfDDf8Sd0,3078648
|
10
11
|
datamule/data/company_tickers.csv,sha256=ihU6aNFriN0lADloCO85Op04deFk3qVcLZ0EJhi5QVo,410362
|
@@ -46,8 +47,8 @@ datamule/parser/document_parsing/n_port_p_parser.py,sha256=T6GliMm-TETPsFM-hDKt1
|
|
46
47
|
datamule/parser/document_parsing/sec_parser.py,sha256=YewOdOsi0P25teQuxS5DNEND9ZCyxE2ewK1DoP9mPto,2788
|
47
48
|
datamule/parser/document_parsing/sgml_parser.py,sha256=ASpe1SzgPj4qk0VOmmuMiEQeatjcwZzsuO3MvsYCHhc,3410
|
48
49
|
datamule/parser/sgml_parsing/sgml_parser_cy.c,sha256=66QwBAmhxkKdhCgCjOkg29umbIgQoK4T5_mmMy3NkkM,841089
|
49
|
-
datamule/parser/sgml_parsing/sgml_parser_cy.cp313-win_amd64.pyd,sha256=
|
50
|
-
datamule-0.
|
51
|
-
datamule-0.
|
52
|
-
datamule-0.
|
53
|
-
datamule-0.
|
50
|
+
datamule/parser/sgml_parsing/sgml_parser_cy.cp313-win_amd64.pyd,sha256=_vSFnvAIOWLmrNXyHnQ9q65Flm0uv52J0xw2hGkhsOc,132096
|
51
|
+
datamule-0.428.dist-info/METADATA,sha256=uTujTeBT6sheV6gRv0X6WJ-x1xdoM0zAJj2tVvOVNDU,1037
|
52
|
+
datamule-0.428.dist-info/WHEEL,sha256=4-iQBlRoDdX1wfPofc7KLWa5Cys4eZSgXs6GVU8fKlQ,101
|
53
|
+
datamule-0.428.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
|
54
|
+
datamule-0.428.dist-info/RECORD,,
|
File without changes
|
File without changes
|