datamule 0.427__cp313-cp313-win_amd64.whl → 0.428__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/__init__.py CHANGED
@@ -39,6 +39,9 @@ def __getattr__(name):
39
39
  elif name == 'load_package_dataset':
40
40
  from .helper import load_package_dataset
41
41
  return load_package_dataset
42
+ elif name == 'Config':
43
+ from .config import Config
44
+ return Config
42
45
  raise AttributeError(f"module 'datamule' has no attribute '{name}'")
43
46
 
44
47
  # Lazy load nest_asyncio only when needed
datamule/config.py ADDED
@@ -0,0 +1,29 @@
1
+ import json
2
+ import os
3
+
4
class Config:
    """Persist datamule user settings as JSON in the user's home directory.

    Settings live in ``~/.datamule/config.json``. The directory and a file
    containing ``{"default_source": None}`` are created on construction if
    they do not exist yet.
    """

    def __init__(self):
        self.config_path = os.path.expanduser("~/.datamule/config.json")
        self._ensure_config_exists()

    def _ensure_config_exists(self):
        """Create the config directory and a default config file when missing."""
        os.makedirs(os.path.dirname(self.config_path), exist_ok=True)
        if not os.path.exists(self.config_path):
            self._save_config({"default_source": None})

    def _save_config(self, config):
        """Overwrite the config file with `config` serialized as JSON."""
        with open(self.config_path, 'w', encoding='utf-8') as f:
            json.dump(config, f)

    def set_default_source(self, source):
        """Store `source` under the "default_source" key, keeping other keys."""
        config = self._load_config()
        config["default_source"] = source
        self._save_config(config)

    def get_default_source(self):
        """Return the stored "default_source" value, or None if unset."""
        return self._load_config().get("default_source")

    def _load_config(self):
        """Return the stored settings as a dict.

        A missing, empty, undecodable or non-object config file yields the
        default settings instead of raising, so one bad write cannot make
        every later get/set call fail.
        """
        try:
            with open(self.config_path, encoding='utf-8') as f:
                config = json.load(f)
        except (FileNotFoundError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return {"default_source": None}
        if not isinstance(config, dict):
            return {"default_source": None}
        return config
datamule/document.py CHANGED
@@ -1,7 +1,9 @@
1
1
  import json
2
2
  import csv
3
3
  from .parser.document_parsing.sec_parser import Parser
4
+ from .parser.document_parsing.helper import load_file_content
4
5
  from .helper import convert_to_dashed_accession
6
+ import re
5
7
 
6
8
  # TODO: modify parse_filing so it can optionally work on in-memory content
7
9
 
@@ -10,12 +12,22 @@ parser = Parser()
10
12
  class Document:
11
13
  def __init__(self, type, filename):
12
14
  self.type = type
13
- self.filename = filename
15
+ self.path = filename
14
16
 
15
17
  self.data = None
18
+ self.content = None
16
19
 
20
def contains_string(self, pattern):
    """Return True if the regex `pattern` matches anywhere in the document text.

    Only .htm, .html and .txt files are searched; any other extension
    returns False without reading the file. The extension check is
    case-insensitive and accepts `self.path` as either a str or a path-like
    object, because other methods of this class treat the path as a string.
    The text is loaded once through load_file_content and cached on
    `self.content`.
    """
    import os  # local import so the method does not depend on file-level imports

    extension = os.path.splitext(os.fspath(self.path))[1].lower()
    if extension not in ('.htm', '.html', '.txt'):
        return False
    if self.content is None:
        self.content = load_file_content(self.path)
    return re.search(pattern, self.content) is not None
27
+
28
+ # Note: this method will be heavily modified in the future
17
29
  def parse(self):
18
- self.data = parser.parse_filing(self.filename, self.type)
30
+ self.data = parser.parse_filing(self.path, self.type)
19
31
  return self.data
20
32
 
21
33
  def write_json(self, output_filename=None):
@@ -23,7 +35,7 @@ class Document:
23
35
  raise ValueError("No data to write. Parse filing first.")
24
36
 
25
37
  if output_filename is None:
26
- output_filename = f"{self.filename.rsplit('.', 1)[0]}.json"
38
+ output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
27
39
 
28
40
  with open(output_filename, 'w') as f:
29
41
  json.dump(self.data, f, indent=2)
@@ -33,7 +45,7 @@ class Document:
33
45
  raise ValueError("No data available. Please call parse_filing() first.")
34
46
 
35
47
  if output_filename is None:
36
- output_filename = f"{self.filename.rsplit('.', 1)[0]}.csv"
48
+ output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
37
49
 
38
50
  with open(output_filename, 'w', newline='') as csvfile:
39
51
  if not self.data:
datamule/portfolio.py CHANGED
@@ -2,9 +2,11 @@ from pathlib import Path
2
2
  from tqdm import tqdm
3
3
  from concurrent.futures import ProcessPoolExecutor
4
4
  from .submission import Submission
5
+ from .downloader.premiumdownloader import PremiumDownloader
6
+ from .downloader.downloader import Downloader
7
+ from .config import Config
5
8
 
6
9
  class Portfolio:
7
- @classmethod
8
10
  def create(cls, path):
9
11
  # This method handles the process pool lifecycle
10
12
  with ProcessPoolExecutor() as executor:
@@ -13,19 +15,64 @@ class Portfolio:
13
15
 
14
16
  def __init__(self, path, executor=None):
15
17
  self.path = Path(path)
16
- folders = [f for f in self.path.iterdir() if f.is_dir()]
17
- print(f"Loading {len(folders)} submissions")
18
-
19
- if executor is None:
20
- # Fall back to sequential loading if no executor
21
- self.submissions = [Submission(f) for f in tqdm(folders, desc="Loading submissions")]
18
+ # check if path exists
19
+ if self.path.exists():
20
+ folders = [f for f in self.path.iterdir() if f.is_dir()]
21
+ print(f"Loading {len(folders)} submissions")
22
+
23
+ if executor is None:
24
+ # Fall back to sequential loading if no executor
25
+ self.submissions = [Submission(f) for f in tqdm(folders, desc="Loading submissions")]
26
+ else:
27
+ # Use provided executor for parallel loading
28
+ self.submissions = list(tqdm(
29
+ executor.map(Submission, folders),
30
+ total=len(folders),
31
+ desc="Loading submissions"
32
+ ))
33
+
22
34
  else:
23
- # Use provided executor for parallel loading
24
- self.submissions = list(tqdm(
25
- executor.map(Submission, folders),
26
- total=len(folders),
27
- desc="Loading submissions"
28
- ))
35
+ pass
36
+
37
def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None):
    """Download submissions into this portfolio's directory.

    Args:
        cik, ticker, submission_type, filing_date: Filters passed unchanged
            to the selected downloader's download_submissions().
        provider: 'sec' selects Downloader and 'datamule' selects
            PremiumDownloader. When None, the default source stored in
            Config is used.

    Raises:
        ValueError: If the resolved provider is neither 'sec' nor 'datamule'.
            This includes the case where no provider is passed and no
            default source has been configured.
    """
    if provider is None:
        provider = Config().get_default_source()

    if provider == 'sec':
        downloader = Downloader()
    elif provider == 'datamule':
        downloader = PremiumDownloader()
    else:
        # Previously this fell through and failed later with UnboundLocalError.
        raise ValueError(
            f"Unknown provider {provider!r}: expected 'sec' or 'datamule'. "
            "Pass provider=... or set one with Config().set_default_source(...)."
        )

    downloader.download_submissions(
        output_dir=self.path,
        cik=cik,
        ticker=ticker,
        submission_type=submission_type,
        filing_date=filing_date,
    )
30
49
  def __iter__(self):
31
- return iter(self.submissions)
50
+ return iter(self.submissions)
51
+
52
def document_type(self, document_types):
    """Yield the documents of the given type(s) from every submission.

    `document_types` may be a single type string or a collection of types;
    a lone string is wrapped in a list before being handed to each
    submission's own document_type().
    """
    wanted = [document_types] if isinstance(document_types, str) else document_types
    for entry in self.submissions:
        for document in entry.document_type(wanted):
            yield document
59
+
60
def contains_string(self, pattern, document_types=None, executor=None):
    """Yield every document whose text matches the regex `pattern`.

    Args:
        pattern: Pattern passed to each document's contains_string().
        document_types: Optional type or list of types; when given, only
            documents returned by self.document_type() are searched,
            otherwise every document of every submission is searched.
        executor: Optional executor used to run the checks through map().

    The parallel path sends operator.methodcaller to the executor instead of
    a nested function: a locally defined closure cannot be pickled, so it
    cannot be shipped to a process pool. Workers return only booleans, and
    the documents yielded are the caller's own objects, in input order.
    """
    from operator import methodcaller  # picklable stand-in for a local closure

    if document_types:
        documents = list(self.document_type(document_types))
    else:
        documents = [
            doc
            for sub in tqdm(self.submissions, desc="Collecting documents")
            for doc in sub
        ]

    progress_label = f"Searching for '{pattern}'"
    if executor:
        outcomes = tqdm(
            executor.map(methodcaller('contains_string', pattern), documents),
            total=len(documents),
            desc=progress_label,
        )
        for document, matched in zip(documents, outcomes):
            if matched:
                yield document
    else:
        for document in tqdm(documents, desc=progress_label):
            if document.contains_string(pattern):
                yield document
datamule/submission.py CHANGED
@@ -64,4 +64,13 @@ class Submission:
64
64
  continue
65
65
 
66
66
  document_path = self.path / filename
67
- yield Document(doc['TYPE'], document_path)
67
+ yield Document(doc['TYPE'], document_path)
68
+
69
def __iter__(self):
    """Yield a Document for each metadata entry that has a FILENAME.

    Entries whose 'FILENAME' is missing or None are skipped; the document
    path is the submission path joined with that filename.
    """
    for entry in self.metadata['documents']:
        name = entry.get('FILENAME')
        if name is not None:
            target = self.path / name
            yield Document(entry['TYPE'], target)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 0.427
3
+ Version: 0.428
4
4
  Summary: Making it easier to use SEC filings.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -24,8 +24,8 @@ Requires-Dist: pandas; extra == "dataset-builder"
24
24
  Requires-Dist: google-generativeai; extra == "dataset-builder"
25
25
  Requires-Dist: psutil; extra == "dataset-builder"
26
26
  Provides-Extra: all
27
+ Requires-Dist: flask; extra == "all"
27
28
  Requires-Dist: google-generativeai; extra == "all"
29
+ Requires-Dist: pandas; extra == "all"
28
30
  Requires-Dist: psutil; extra == "all"
29
- Requires-Dist: flask; extra == "all"
30
31
  Requires-Dist: openai; extra == "all"
31
- Requires-Dist: pandas; extra == "all"
@@ -1,10 +1,11 @@
1
- datamule/__init__.py,sha256=Li3iau_u87wQQhoPliSTTpGaf3OMf5jIvqtHFJmCvnw,2338
2
- datamule/document.py,sha256=6xEaI-32AQiBxX3gZcX4Qr49bgvcvLviFAwUGpTwtr0,5273
1
+ datamule/__init__.py,sha256=ghCMkcNrtQ2dYz9ulnlZ0JSe-aJF0YSVa3da2g0eIWk,2425
2
+ datamule/config.py,sha256=lrXwhbWFAF3eTa6B4OQgvexSYvaFa-EkWofpLn6AKZM,911
3
+ datamule/document.py,sha256=NoJS6Q9f7Z2bZzKkaoxooYHdDwqs-TCw_BkVH3krtIU,5774
3
4
  datamule/helper.py,sha256=8HOjB3Y7svw_zjEY-AY5JKOJ-LrBiuQMPyok3MH6CCg,4716
4
5
  datamule/monitor.py,sha256=WVds1HGV_ojYgWmo0b4Dsiv9mzZ85HHnCucH-7XoUw8,9350
5
6
  datamule/packageupdater.py,sha256=X73XlXs9bYSPiwtceaSOEq6wSCqKoG8lyhNyuha6aG0,9801
6
- datamule/portfolio.py,sha256=OOlu_05S88uwg5Me_EpY3C7BQ93Yq9eE0_tMY_Gqyrw,1117
7
- datamule/submission.py,sha256=sB6tidsAdaqP5VIQEFPq6PjLTgmD-crgdNviaOpiqlU,2558
7
+ datamule/portfolio.py,sha256=gKnudwDS-_Qtm0j9YtfGg2mhk_-TVg2h7Ax-faIYqsY,3193
8
+ datamule/submission.py,sha256=cfX7fvHQBObff0N1jzikCGTuAUE1bGIyqenLRxch9eg,2865
8
9
  datamule/data/company_former_names.csv,sha256=zTBWdV12_JE3aROFOMrFNTHLPW_M4TDruxtl15-XfA0,714528
9
10
  datamule/data/company_metadata.csv,sha256=X7uSIwConqC0sz-moIhXIISg6FI7GLGSlvAfDDf8Sd0,3078648
10
11
  datamule/data/company_tickers.csv,sha256=ihU6aNFriN0lADloCO85Op04deFk3qVcLZ0EJhi5QVo,410362
@@ -46,8 +47,8 @@ datamule/parser/document_parsing/n_port_p_parser.py,sha256=T6GliMm-TETPsFM-hDKt1
46
47
  datamule/parser/document_parsing/sec_parser.py,sha256=YewOdOsi0P25teQuxS5DNEND9ZCyxE2ewK1DoP9mPto,2788
47
48
  datamule/parser/document_parsing/sgml_parser.py,sha256=ASpe1SzgPj4qk0VOmmuMiEQeatjcwZzsuO3MvsYCHhc,3410
48
49
  datamule/parser/sgml_parsing/sgml_parser_cy.c,sha256=66QwBAmhxkKdhCgCjOkg29umbIgQoK4T5_mmMy3NkkM,841089
49
- datamule/parser/sgml_parsing/sgml_parser_cy.cp313-win_amd64.pyd,sha256=S_2QAcI8mDzM4hijIuz-O17QwDRXY6vFJdq6mO_ymx0,132096
50
- datamule-0.427.dist-info/METADATA,sha256=T40XFqzuBmDa2cjcwXumRdPV-H_T95ODY1h6QSJAE_E,1037
51
- datamule-0.427.dist-info/WHEEL,sha256=4-iQBlRoDdX1wfPofc7KLWa5Cys4eZSgXs6GVU8fKlQ,101
52
- datamule-0.427.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
53
- datamule-0.427.dist-info/RECORD,,
50
+ datamule/parser/sgml_parsing/sgml_parser_cy.cp313-win_amd64.pyd,sha256=_vSFnvAIOWLmrNXyHnQ9q65Flm0uv52J0xw2hGkhsOc,132096
51
+ datamule-0.428.dist-info/METADATA,sha256=uTujTeBT6sheV6gRv0X6WJ-x1xdoM0zAJj2tVvOVNDU,1037
52
+ datamule-0.428.dist-info/WHEEL,sha256=4-iQBlRoDdX1wfPofc7KLWa5Cys4eZSgXs6GVU8fKlQ,101
53
+ datamule-0.428.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
54
+ datamule-0.428.dist-info/RECORD,,