datamule 0.427.tar.gz → 0.428.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {datamule-0.427 → datamule-0.428}/PKG-INFO +1 -1
  2. {datamule-0.427 → datamule-0.428}/datamule/__init__.py +3 -0
  3. datamule-0.428/datamule/config.py +29 -0
  4. {datamule-0.427 → datamule-0.428}/datamule/document.py +16 -4
  5. datamule-0.428/datamule/portfolio.py +78 -0
  6. {datamule-0.427 → datamule-0.428}/datamule/submission.py +10 -1
  7. {datamule-0.427 → datamule-0.428}/datamule.egg-info/PKG-INFO +1 -1
  8. {datamule-0.427 → datamule-0.428}/datamule.egg-info/SOURCES.txt +1 -0
  9. {datamule-0.427 → datamule-0.428}/setup.py +1 -1
  10. datamule-0.427/datamule/portfolio.py +0 -31
  11. {datamule-0.427 → datamule-0.428}/datamule/data/company_former_names.csv +0 -0
  12. {datamule-0.427 → datamule-0.428}/datamule/data/company_metadata.csv +0 -0
  13. {datamule-0.427 → datamule-0.428}/datamule/data/company_tickers.csv +0 -0
  14. {datamule-0.427 → datamule-0.428}/datamule/data/sec-glossary.csv +0 -0
  15. {datamule-0.427 → datamule-0.428}/datamule/data/xbrl_descriptions.csv +0 -0
  16. {datamule-0.427 → datamule-0.428}/datamule/dataset_builder/dataset_builder.py +0 -0
  17. {datamule-0.427 → datamule-0.428}/datamule/downloader/downloader.py +0 -0
  18. {datamule-0.427 → datamule-0.428}/datamule/downloader/premiumdownloader.py +0 -0
  19. {datamule-0.427 → datamule-0.428}/datamule/helper.py +0 -0
  20. {datamule-0.427 → datamule-0.428}/datamule/monitor.py +0 -0
  21. {datamule-0.427 → datamule-0.428}/datamule/mulebot/__init__.py +0 -0
  22. {datamule-0.427 → datamule-0.428}/datamule/mulebot/helper.py +0 -0
  23. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot.py +0 -0
  24. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/__init__.py +0 -0
  25. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/server.py +0 -0
  26. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -0
  27. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -0
  28. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -0
  29. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -0
  30. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -0
  31. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/main.js +0 -0
  32. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -0
  33. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -0
  34. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -0
  35. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -0
  36. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -0
  37. {datamule-0.427 → datamule-0.428}/datamule/mulebot/search.py +0 -0
  38. {datamule-0.427 → datamule-0.428}/datamule/mulebot/tools.py +0 -0
  39. {datamule-0.427 → datamule-0.428}/datamule/packageupdater.py +0 -0
  40. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/basic_10k_parser.py +0 -0
  41. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/basic_10q_parser.py +0 -0
  42. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/basic_13d_parser.py +0 -0
  43. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/basic_13g_parser.py +0 -0
  44. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/basic_8k_parser.py +0 -0
  45. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/form_d_parser.py +0 -0
  46. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/generalized_item_parser.py +0 -0
  47. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/helper.py +0 -0
  48. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/information_table_parser_13fhr.py +0 -0
  49. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/insider_trading_parser.py +0 -0
  50. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/mappings.py +0 -0
  51. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/n_port_p_parser.py +0 -0
  52. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/sec_parser.py +0 -0
  53. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/sgml_parser.py +0 -0
  54. {datamule-0.427 → datamule-0.428}/datamule/parser/sgml_parsing/sgml_parser_cy.c +0 -0
  55. {datamule-0.427 → datamule-0.428}/datamule.egg-info/dependency_links.txt +0 -0
  56. {datamule-0.427 → datamule-0.428}/datamule.egg-info/requires.txt +1 -1
  57. {datamule-0.427 → datamule-0.428}/datamule.egg-info/top_level.txt +0 -0
  58. {datamule-0.427 → datamule-0.428}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 0.427
+Version: 0.428
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -39,6 +39,9 @@ def __getattr__(name):
     elif name == 'load_package_dataset':
         from .helper import load_package_dataset
         return load_package_dataset
+    elif name == 'Config':
+        from .config import Config
+        return Config
     raise AttributeError(f"module 'datamule' has no attribute '{name}'")
 
 # Lazy load nest_asyncio only when needed
@@ -0,0 +1,29 @@
+import json
+import os
+
+class Config:
+    def __init__(self):
+        self.config_path = os.path.expanduser("~/.datamule/config.json")
+        self._ensure_config_exists()
+
+    def _ensure_config_exists(self):
+        os.makedirs(os.path.dirname(self.config_path), exist_ok=True)
+        if not os.path.exists(self.config_path):
+            self._save_config({"default_source": None})
+
+    def _save_config(self, config):
+        with open(self.config_path, 'w') as f:
+            json.dump(config, f)
+
+    def set_default_source(self, source):
+        config = self._load_config()
+        config["default_source"] = source
+        self._save_config(config)
+
+    def get_default_source(self):
+        config = self._load_config()
+        return config.get("default_source")
+
+    def _load_config(self):
+        with open(self.config_path) as f:
+            return json.load(f)
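
The new Config class is a small persistent settings store backed by ~/.datamule/config.json. A minimal usage sketch, assuming datamule 0.428 is installed; the top-level import is confirmed by the __init__.py hunk above, and 'sec' / 'datamule' are the provider names that portfolio.py (below) checks:

    from datamule import Config

    config = Config()                      # creates ~/.datamule/config.json on first use
    config.set_default_source('datamule')  # persisted default provider for downloads
    print(config.get_default_source())     # -> 'datamule'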
@@ -1,7 +1,9 @@
 import json
 import csv
 from .parser.document_parsing.sec_parser import Parser
+from .parser.document_parsing.helper import load_file_content
 from .helper import convert_to_dashed_accession
+import re
 
 # we need to modify parse filing to take option in memory
 
@@ -10,12 +12,22 @@ parser = Parser()
 class Document:
     def __init__(self, type, filename):
         self.type = type
-        self.filename = filename
+        self.path = filename
 
         self.data = None
+        self.content = None
 
+    def contains_string(self, pattern):
+        """Currently only works for .htm, .html, and .txt files"""
+        if self.path.suffix in ['.htm', '.html', '.txt']:
+            if self.content is None:
+                self.content = load_file_content(self.path)
+            return bool(re.search(pattern, self.content))
+        return False
+
+    # Note: this method will be heavily modified in the future
     def parse(self):
-        self.data = parser.parse_filing(self.filename, self.type)
+        self.data = parser.parse_filing(self.path, self.type)
         return self.data
 
     def write_json(self, output_filename=None):
@@ -23,7 +35,7 @@ class Document:
             raise ValueError("No data to write. Parse filing first.")
 
         if output_filename is None:
-            output_filename = f"{self.filename.rsplit('.', 1)[0]}.json"
+            output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
 
         with open(output_filename, 'w') as f:
             json.dump(self.data, f, indent=2)
@@ -33,7 +45,7 @@ class Document:
             raise ValueError("No data available. Please call parse_filing() first.")
 
         if output_filename is None:
-            output_filename = f"{self.filename.rsplit('.', 1)[0]}.csv"
+            output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
 
         with open(output_filename, 'w', newline='') as csvfile:
             if not self.data:
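
With these changes a Document caches its file content on first search and matches regular expressions against it. A hedged sketch of the new contains_string method; the filing path is hypothetical, and it must be a pathlib.Path because the method inspects path.suffix:

    from pathlib import Path
    from datamule.document import Document

    # Only .htm, .html, and .txt files are searched; anything else returns False
    doc = Document('8-K', Path('filings/0001234567-24-000001/exhibit99.htm'))
    if doc.contains_string(r'(?i)\bmerger\b'):  # pattern is passed to re.search
        print(doc.path)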
@@ -0,0 +1,78 @@
+from pathlib import Path
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor
+from .submission import Submission
+from .downloader.premiumdownloader import PremiumDownloader
+from .downloader.downloader import Downloader
+from .config import Config
+
+class Portfolio:
+    def create(cls, path):
+        # This method handles the process pool lifecycle
+        with ProcessPoolExecutor() as executor:
+            portfolio = cls(path, executor)
+        return portfolio
+
+    def __init__(self, path, executor=None):
+        self.path = Path(path)
+        # check if path exists
+        if self.path.exists():
+            folders = [f for f in self.path.iterdir() if f.is_dir()]
+            print(f"Loading {len(folders)} submissions")
+
+            if executor is None:
+                # Fall back to sequential loading if no executor
+                self.submissions = [Submission(f) for f in tqdm(folders, desc="Loading submissions")]
+            else:
+                # Use provided executor for parallel loading
+                self.submissions = list(tqdm(
+                    executor.map(Submission, folders),
+                    total=len(folders),
+                    desc="Loading submissions"
+                ))
+
+        else:
+            pass
+
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None):
+        if provider is None:
+            config = Config()
+            provider = config.get_default_source()
+
+        if provider == 'sec':
+            downloader = Downloader()
+        elif provider == 'datamule':
+            downloader = PremiumDownloader()
+
+        downloader.download_submissions(output_dir=self.path, cik=cik, ticker=ticker, submission_type=submission_type, filing_date=filing_date
+        )
+    def __iter__(self):
+        return iter(self.submissions)
+
+    def document_type(self, document_types):
+        # Convert single document type to list for consistent handling
+        if isinstance(document_types, str):
+            document_types = [document_types]
+
+        for submission in self.submissions:
+            yield from submission.document_type(document_types)
+
+    def contains_string(self, pattern, document_types=None, executor=None):
+        def check_document(document):
+            return document if document.contains_string(pattern) else None
+
+        documents = list(self.document_type(document_types) if document_types else (
+            doc for sub in tqdm(self.submissions, desc="Collecting documents") for doc in sub
+        ))
+
+        if executor:
+            results = list(tqdm(
+                executor.map(check_document, documents),
+                total=len(documents),
+                desc=f"Searching for '{pattern}'"
+            ))
+            yield from (doc for doc in results if doc is not None)
+        else:
+            for document in tqdm(documents, desc=f"Searching for '{pattern}'"):
+                if document.contains_string(pattern):
+                    yield document
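
The new portfolio.py wires Config into downloads and adds corpus-wide regex search. A hedged usage sketch: 'filings' is a hypothetical folder that already holds downloaded submissions, and provider is passed explicitly because, as written, neither branch assigns a downloader when no default source is configured. Note also that create() lost the @classmethod decorator it had in 0.427 (see the deleted file below), so the plain constructor is used:

    from datamule.portfolio import Portfolio

    # Hypothetical folder of already-downloaded submissions
    portfolio = Portfolio('filings')

    # provider='sec' selects Downloader, 'datamule' selects PremiumDownloader
    portfolio.download_submissions(ticker='AAPL', submission_type='8-K', provider='sec')

    # Stream every 8-K document whose text matches a regex
    for document in portfolio.contains_string(r'(?i)acquisition', document_types='8-K'):
        print(document.path)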
@@ -64,4 +64,13 @@ class Submission:
                 continue
 
             document_path = self.path / filename
-            yield Document(doc['TYPE'], document_path)
+            yield Document(doc['TYPE'], document_path)
+
+    def __iter__(self):
+        for doc in self.metadata['documents']:
+            filename = doc.get('FILENAME')
+            if filename is None:
+                continue
+
+            document_path = self.path / filename
+            yield Document(doc['TYPE'], document_path)
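
The new Submission.__iter__ duplicates the tail of document_type but yields every document that has a FILENAME, which is what lets Portfolio.contains_string walk whole submissions when no document type is given. A short sketch, again with a hypothetical pre-populated folder:

    from datamule.portfolio import Portfolio

    portfolio = Portfolio('filings')    # hypothetical, already-populated folder
    for submission in portfolio:        # Portfolio.__iter__
        for document in submission:     # new Submission.__iter__; entries without FILENAME are skipped
            print(document.type, document.path)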
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 0.427
+Version: 0.428
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -1,5 +1,6 @@
 setup.py
 datamule/__init__.py
+datamule/config.py
 datamule/document.py
 datamule/helper.py
 datamule/monitor.py
@@ -55,7 +55,7 @@ extras["all"] = list(all_dependencies)
 setup(
     name="datamule",
     author="John Friedman",
-    version="0.427",
+    version="0.428",
     description="Making it easier to use SEC filings.",
     packages=find_namespace_packages(include=['datamule*']),
     url="https://github.com/john-friedman/datamule-python",
@@ -1,31 +0,0 @@
-from pathlib import Path
-from tqdm import tqdm
-from concurrent.futures import ProcessPoolExecutor
-from .submission import Submission
-
-class Portfolio:
-    @classmethod
-    def create(cls, path):
-        # This method handles the process pool lifecycle
-        with ProcessPoolExecutor() as executor:
-            portfolio = cls(path, executor)
-        return portfolio
-
-    def __init__(self, path, executor=None):
-        self.path = Path(path)
-        folders = [f for f in self.path.iterdir() if f.is_dir()]
-        print(f"Loading {len(folders)} submissions")
-
-        if executor is None:
-            # Fall back to sequential loading if no executor
-            self.submissions = [Submission(f) for f in tqdm(folders, desc="Loading submissions")]
-        else:
-            # Use provided executor for parallel loading
-            self.submissions = list(tqdm(
-                executor.map(Submission, folders),
-                total=len(folders),
-                desc="Loading submissions"
-            ))
-
-    def __iter__(self):
-        return iter(self.submissions)
@@ -11,10 +11,10 @@ pytz
 zstandard
 
 [all]
+openai
 flask
 pandas
 psutil
-openai
 google-generativeai
 
 [dataset_builder]