datamule-0.428-cp310-cp310-macosx_10_9_universal2.whl → datamule-0.430-cp310-cp310-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/document.py CHANGED
@@ -17,6 +17,9 @@ class Document:
         self.data = None
         self.content = None
 
+    def _load_content(self):
+        self.content = load_file_content(self.path)
+
     def contains_string(self, pattern):
         """Currently only works for .htm, .html, and .txt files"""
         if self.path.suffix in ['.htm', '.html', '.txt']:
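The added _load_content reads the filing from disk on demand instead of at construction time. Below is a self-contained sketch of that lazy-loading pattern; the class name and the read_text call are illustrative stand-ins, not datamule's actual Document implementation.

from pathlib import Path

class LazyDocument:
    # Illustrative only: mirrors the lazy-load pattern added to Document above.
    def __init__(self, path):
        self.path = Path(path)
        self.content = None  # nothing is read at construction time

    def _load_content(self):
        # Load the file contents only when a caller actually needs them.
        self.content = self.path.read_text(errors="ignore")

    def contains_string(self, pattern):
        # Only text-like formats are searchable, as in the real Document.
        if self.path.suffix in ['.htm', '.html', '.txt']:
            if self.content is None:
                self._load_content()
            return pattern in self.content
        return False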
datamule/downloader/premiumdownloader.py CHANGED
@@ -259,7 +259,8 @@ class PremiumDownloader:
                 keepalive_timeout=60
             )
 
-        async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=3600)) as session:
+        # timeout should be max 2 hours.
+        async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=7200)) as session:
             tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
             await asyncio.gather(*tasks, return_exceptions=True)
 
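The only functional change above is the session-wide timeout, raised from one hour (3600 s) to two hours (7200 s) so very large download batches are not cut off. A minimal standalone sketch of the same session setup follows; the URLs and the fetch body are placeholders, not datamule code.

import asyncio
import aiohttp

async def fetch_all(urls):
    connector = aiohttp.TCPConnector(keepalive_timeout=60)
    # total=7200 caps the whole batch at 2 hours, matching the change above.
    timeout = aiohttp.ClientTimeout(total=7200)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        async def fetch(url):
            async with session.get(url) as resp:
                return await resp.read()
        # return_exceptions=True keeps one failed download from cancelling the rest.
        return await asyncio.gather(*(fetch(u) for u in urls), return_exceptions=True)

# asyncio.run(fetch_all(["https://example.com/filing.sgml"]))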
datamule/parser/document_parsing/helper.py CHANGED
@@ -62,7 +62,7 @@ def load_file_content(filename):
     elif filename.suffix in ['.html','.htm']:
         return load_html_content(filename)
     else:
-        raise ValueError(f"Unsupported file type: {filename}")
+        raise ValueError(f"Unsupported file type: {filename.suffix}")
 
 def clean_title(title: str) -> str:
     """Clean up section title by removing newlines, periods, and all whitespace, converting to lowercase."""
datamule/portfolio.py CHANGED
@@ -1,78 +1,95 @@
 from pathlib import Path
 from tqdm import tqdm
-from concurrent.futures import ProcessPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
 from .submission import Submission
 from .downloader.premiumdownloader import PremiumDownloader
 from .downloader.downloader import Downloader
 from .config import Config
+import os
 
 class Portfolio:
-    def create(cls, path):
-        # This method handles the process pool lifecycle
-        with ProcessPoolExecutor() as executor:
-            portfolio = cls(path, executor)
-            return portfolio
-
-    def __init__(self, path, executor=None):
+    def __init__(self, path):
         self.path = Path(path)
-        # check if path exists
+        self.submissions = []
+        self.MAX_WORKERS = os.cpu_count() - 1
+
         if self.path.exists():
-            folders = [f for f in self.path.iterdir() if f.is_dir()]
-            print(f"Loading {len(folders)} submissions")
-
-            if executor is None:
-                # Fall back to sequential loading if no executor
-                self.submissions = [Submission(f) for f in tqdm(folders, desc="Loading submissions")]
-            else:
-                # Use provided executor for parallel loading
-                self.submissions = list(tqdm(
-                    executor.map(Submission, folders),
-                    total=len(folders),
-                    desc="Loading submissions"
-                ))
-
-        else:
-            pass
+            self._load_submissions()
+
+    def _load_submissions(self):
+        folders = [f for f in self.path.iterdir() if f.is_dir()]
+        print(f"Loading {len(folders)} submissions")
+
+        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            self.submissions = list(tqdm(
+                executor.map(Submission, folders),
+                total=len(folders),
+                desc="Loading submissions"
+            ))
+
+    def process_submissions(self, callback):
+        """Process all submissions using a thread pool."""
+        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            results = list(tqdm(
+                executor.map(callback, self.submissions),
+                total=len(self.submissions),
+                desc="Processing submissions"
+            ))
+        return results
+
+    def process_documents(self, callback):
+        """Process all documents using a thread pool."""
+        documents = [doc for sub in self.submissions for doc in sub]
+
+        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            results = list(tqdm(
+                executor.map(callback, documents),
+                total=len(documents),
+                desc="Processing documents"
+            ))
+        return results
 
     def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
 
-        if provider == 'sec':
-            downloader = Downloader()
-        elif provider == 'datamule':
-            downloader = PremiumDownloader()
+        downloader = PremiumDownloader() if provider == 'datamule' else Downloader()
+        downloader.download_submissions(
+            output_dir=self.path,
+            cik=cik,
+            ticker=ticker,
+            submission_type=submission_type,
+            filing_date=filing_date
+        )
+
+        # Reload submissions after download
+        self._load_submissions()
 
-        downloader.download_submissions(output_dir=self.path, cik=cik, ticker=ticker, submission_type=submission_type, filing_date=filing_date
-        )
     def __iter__(self):
         return iter(self.submissions)
 
     def document_type(self, document_types):
-        # Convert single document type to list for consistent handling
+        """Filter documents by type(s)."""
         if isinstance(document_types, str):
             document_types = [document_types]
 
         for submission in self.submissions:
             yield from submission.document_type(document_types)
 
-    def contains_string(self, pattern, document_types=None, executor=None):
+    def contains_string(self, pattern, document_types=None):
+        """Search for pattern in documents, with optional type filter."""
         def check_document(document):
             return document if document.contains_string(pattern) else None
 
-        documents = list(self.document_type(document_types) if document_types else (
-            doc for sub in tqdm(self.submissions, desc="Collecting documents") for doc in sub
-        ))
+        # Get documents, filtered by type if specified
+        documents = list(self.document_type(document_types)) if document_types else [
+            doc for sub in self.submissions for doc in sub
+        ]
 
-        if executor:
-            results = list(tqdm(
-                executor.map(check_document, documents),
-                total=len(documents),
-                desc=f"Searching for '{pattern}'"
-            ))
-            yield from (doc for doc in results if doc is not None)
-        else:
-            for document in tqdm(documents, desc=f"Searching for '{pattern}'"):
-                if document.contains_string(pattern):
-                    yield document
+        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            results = executor.map(check_document, documents)
+
+            for doc in tqdm(results, total=len(documents), desc=f"Searching for '{pattern}'"):
+                if doc is not None:
+                    yield doc
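Taken together, the rewrite gives Portfolio a simpler, thread-pool-backed interface: construction loads submissions in parallel, process_submissions/process_documents map a callback over the corpus, and contains_string streams matches. A usage sketch based only on what this diff shows; the directory path, the callback, and the "10-K" type string are placeholders, and the import assumes Portfolio is exported at the package root.

from datamule import Portfolio  # assumed top-level export

portfolio = Portfolio("filings/")  # loads each submission sub-folder on a thread pool

# Map an arbitrary callback over every document in every submission.
def mentions_risk(document):
    return document.contains_string("Risk Factors")

flags = portfolio.process_documents(mentions_risk)

# Stream back only the documents of a given type that contain a phrase.
for doc in portfolio.contains_string("climate", document_types="10-K"):
    print(doc.path)

Swapping ProcessPoolExecutor for ThreadPoolExecutor presumably trades process isolation for lower overhead: Submission objects no longer have to be pickled across processes, and loading and searching filings is I/O-bound work that threads handle well despite the GIL.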
datamule-0.430.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 0.428
+Version: 0.430
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -24,8 +24,8 @@ Requires-Dist: pandas; extra == "dataset-builder"
 Requires-Dist: google-generativeai; extra == "dataset-builder"
 Requires-Dist: psutil; extra == "dataset-builder"
 Provides-Extra: all
-Requires-Dist: openai; extra == "all"
-Requires-Dist: pandas; extra == "all"
+Requires-Dist: psutil; extra == "all"
 Requires-Dist: google-generativeai; extra == "all"
+Requires-Dist: openai; extra == "all"
 Requires-Dist: flask; extra == "all"
-Requires-Dist: psutil; extra == "all"
+Requires-Dist: pandas; extra == "all"
datamule-0.430.dist-info/RECORD CHANGED
@@ -1,10 +1,10 @@
 datamule/__init__.py,sha256=qRl4jjMTyekdRV16KOyA4_DoqB-pBi8hMxaUDo88Vu8,2352
 datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
-datamule/document.py,sha256=GI2BbwSijRkzRp-qz94HzSkYTebGYKXXPwWMbqfepTM,5633
+datamule/document.py,sha256=LIUaDUS-mBigCghnecEespNnFwKsVj6pwpZIyusMq-M,5715
 datamule/helper.py,sha256=tr3AQWus9dHNZFKpLSglWjcb8zmm5qDXjOWACMhvMxQ,4594
 datamule/monitor.py,sha256=mRaM8v5NgcMF9DJ1s_YBzucjrbr-3yFwW422MVml-_Q,9114
 datamule/packageupdater.py,sha256=vEGqlTj6FudIeVHBVJltPh2eBDEqMG9HYmnyrRVKeSU,9595
-datamule/portfolio.py,sha256=Iq-MmOIMdV1VpOcPnOgh7xk1hqUzWWmHoy2iUziLjow,3116
+datamule/portfolio.py,sha256=CNGhNpk8JwL85CCX7RYheTevFB-eGB0_VK02Bo_t9Mc,3566
 datamule/submission.py,sha256=j-uIuwHIvRheXo5RcDf1GYVeYbdjq7yfoU1-GKTH2f8,2790
 datamule/data/company_former_names.csv,sha256=HE9cAv-_QKFX6jT-_-D0rHmaDyQuAzL4MJwank5O1U8,706380
 datamule/data/company_metadata.csv,sha256=yPovrCVjYwLWTU_hBUFJymp8iNO0NBYuq_QwOkRLoN8,3068599
@@ -13,7 +13,7 @@ datamule/data/sec-glossary.csv,sha256=-cN7GjiadLw5C1sv4zSeCnfeZZDYeSgJl-0ydarMAo
 datamule/data/xbrl_descriptions.csv,sha256=SQ9wUURNqG424rnTiZtopsxV2q-PvU4NMj52LqgDsvg,2621524
 datamule/dataset_builder/dataset_builder.py,sha256=NCvNbDwlEkA_eAbqbsG--YlqPBDREFTVSM1GJquR0RE,9747
 datamule/downloader/downloader.py,sha256=Cw8FAc0oi8H4okK6pVuS9hkAGoXR72fV3xAdF83J4v4,14558
-datamule/downloader/premiumdownloader.py,sha256=JH4aZ-ZwARCIACKwgzSgHAuOkKPc_GnhiUHSSu22XO4,14206
+datamule/downloader/premiumdownloader.py,sha256=WyR0lHHva9txCYzf-OrZ2o-ScPxDBtjAfnB5ebUpzjk,14251
 datamule/mulebot/__init__.py,sha256=YvZXV6xQ0iP-oGD8rloufjdwJL6D46P3NNr0CY9PQCA,29
 datamule/mulebot/helper.py,sha256=olztOwltfELZ-IERM2bRNLBavD04kfB6ueWTisJAleA,1080
 datamule/mulebot/mulebot.py,sha256=XbtgvXBSFu9OaaLW_k1KDgHVTNQGV8_0ZwNMFad-pPU,5837
@@ -39,7 +39,7 @@ datamule/parser/document_parsing/basic_13g_parser.py,sha256=sWg83-QTAzUDNs45iWtp
 datamule/parser/document_parsing/basic_8k_parser.py,sha256=inCSmlH_BkLK0Lkvt0kZ6EUJ0nijul_RkdXzccyOmRI,2466
 datamule/parser/document_parsing/form_d_parser.py,sha256=dWlGeVZRzh0kfT3gVMC8eyqeQORdVV3r8KXUwEqAW3s,2036
 datamule/parser/document_parsing/generalized_item_parser.py,sha256=67_DFb1BQbMmdHefEgoCPlEoiUT0zyxh3eBNJpjGXUk,2616
-datamule/parser/document_parsing/helper.py,sha256=QPhVxLxMSx6Qdi7sR4D4iPObGoTnVD3tXTCNWzNxStg,2533
+datamule/parser/document_parsing/helper.py,sha256=sIddTOeX3yjCXlJeW8_2-10NU9omDJrG9HyYdXisB1I,2540
 datamule/parser/document_parsing/information_table_parser_13fhr.py,sha256=R4Up1oDx3xAlzHwXzVzUkdOSsk8YPuJBPS_3I_bNQSE,1767
 datamule/parser/document_parsing/insider_trading_parser.py,sha256=OVQDeLcfaZtgmOWvWPDotftO6jxx-doFAqBYVqNgypo,7106
 datamule/parser/document_parsing/mappings.py,sha256=VKdnT3C5yPTbB4ZBa4El4jnB-6_osomm2rbJx6Ac6HE,5286
@@ -47,8 +47,8 @@ datamule/parser/document_parsing/n_port_p_parser.py,sha256=GmmQFkCZt57WikUZ5Daht
 datamule/parser/document_parsing/sec_parser.py,sha256=AS8H4h1sfUAdWP2gotULcjbylsYN_nHgTfkeVRyENPo,2716
 datamule/parser/document_parsing/sgml_parser.py,sha256=tC1cL3cdVQPWbc9QtoRUYSo2wRuYNaglFaCmP57oEfA,3317
 datamule/parser/sgml_parsing/sgml_parser_cy.c,sha256=P36bPkmVEmEwIhxumwJTmPslHqCaFcNhVSKPTQBQAH0,840575
-datamule/parser/sgml_parsing/sgml_parser_cy.cpython-310-darwin.so,sha256=xcyJC5M1WSGr8l4eZeBffUGGUTAmXxGnPvj0eqD7AaM,395800
-datamule-0.428.dist-info/METADATA,sha256=PwVnD4CKpBieBe9hsnNhv85tVRdr7MGRyfiPMI6MPDU,1006
-datamule-0.428.dist-info/WHEEL,sha256=hckVD9W218I-cSckYATMe3Ko_37-6EaiE9IaoMBKclc,114
-datamule-0.428.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
-datamule-0.428.dist-info/RECORD,,
+datamule/parser/sgml_parsing/sgml_parser_cy.cpython-310-darwin.so,sha256=rW_mVXXT1n-75AG_Dfs9QHGg-kFkiQ7srk_jBWe8cwQ,395800
+datamule-0.430.dist-info/METADATA,sha256=mo_5k3kJja3arHNTKs5NzswM5Ixw1m1mxV1WarRerZo,1006
+datamule-0.430.dist-info/WHEEL,sha256=SCc8MHJxw-p1LVYhHgn-sVJQOJeWCoPWUIXCO4nz594,114
+datamule-0.430.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-0.430.dist-info/RECORD,,
datamule-0.430.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.7.0)
 Root-Is-Purelib: false
 Tag: cp310-cp310-macosx_10_9_universal2