datamule 2.1.2__tar.gz → 2.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. {datamule-2.1.2 → datamule-2.1.4}/PKG-INFO +1 -1
  2. datamule-2.1.4/datamule/datasets.py +49 -0
  3. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/document.py +29 -8
  4. {datamule-2.1.2 → datamule-2.1.4}/datamule/portfolio.py +10 -6
  5. {datamule-2.1.2 → datamule-2.1.4}/datamule/submission.py +4 -4
  6. {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/PKG-INFO +1 -1
  7. {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/SOURCES.txt +1 -0
  8. {datamule-2.1.2 → datamule-2.1.4}/setup.py +1 -1
  9. {datamule-2.1.2 → datamule-2.1.4}/datamule/__init__.py +0 -0
  10. {datamule-2.1.2 → datamule-2.1.4}/datamule/config.py +0 -0
  11. {datamule-2.1.2 → datamule-2.1.4}/datamule/data/listed_filer_metadata.csv +0 -0
  12. {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/__init__.py +0 -0
  13. {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/datamule_lookup.py +0 -0
  14. {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/datamule_mysql_rds.py +0 -0
  15. {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/downloader.py +0 -0
  16. {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/sec_connector.py +0 -0
  17. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/__init__.py +0 -0
  18. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/__init__.py +0 -0
  19. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables.py +0 -0
  20. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_13fhr.py +0 -0
  21. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_25nse.py +0 -0
  22. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_informationtable.py +0 -0
  23. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_npx.py +0 -0
  24. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_ownership.py +0 -0
  25. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
  26. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_sbsef.py +0 -0
  27. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_sdr.py +0 -0
  28. {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/utils.py +0 -0
  29. {datamule-2.1.2 → datamule-2.1.4}/datamule/helper.py +0 -0
  30. {datamule-2.1.2 → datamule-2.1.4}/datamule/index.py +0 -0
  31. {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/__init__.py +0 -0
  32. {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  33. {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  34. {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  35. {datamule-2.1.2 → datamule-2.1.4}/datamule/package_updater.py +0 -0
  36. {datamule-2.1.2 → datamule-2.1.4}/datamule/portfolio_compression_utils.py +0 -0
  37. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/__init__.py +0 -0
  38. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/infrastructure/__init__.py +0 -0
  39. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  40. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/__init__.py +0 -0
  41. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/downloader.py +0 -0
  42. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/eftsquery.py +0 -0
  43. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/monitor.py +0 -0
  44. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/streamer.py +0 -0
  45. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/textsearch.py +0 -0
  46. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/utils.py +0 -0
  47. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/__init__.py +0 -0
  48. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  49. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  50. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  51. {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  52. {datamule-2.1.2 → datamule-2.1.4}/datamule/seclibrary/__init__.py +0 -0
  53. {datamule-2.1.2 → datamule-2.1.4}/datamule/seclibrary/bq.py +0 -0
  54. {datamule-2.1.2 → datamule-2.1.4}/datamule/sheet.py +0 -0
  55. {datamule-2.1.2 → datamule-2.1.4}/datamule/utils/__init__.py +0 -0
  56. {datamule-2.1.2 → datamule-2.1.4}/datamule/utils/construct_submissions_data.py +0 -0
  57. {datamule-2.1.2 → datamule-2.1.4}/datamule/utils/format_accession.py +0 -0
  58. {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/dependency_links.txt +0 -0
  59. {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/requires.txt +0 -0
  60. {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/top_level.txt +0 -0
  61. {datamule-2.1.2 → datamule-2.1.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.1.2
3
+ Version: 2.1.4
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -0,0 +1,49 @@
1
+ # datamule/datasets.py
2
+ from pathlib import Path
3
+ import requests
4
+ import gzip
5
+ import shutil
6
+ import csv
7
+
8
+ # Dataset URLs
9
+ DATASET_URLS = {
10
+ "cik_cusip_crosswalk": "https://github.com/john-friedman/datamule-data/raw/refs/heads/master/data/datasets/cik_cusip_crosswalk.csv.gz"
11
+ }
12
+
13
+ def update_dataset(name):
14
+ """Force update a dataset by re-downloading it."""
15
+ return _get_dataset(name, update=True)
16
+
17
+ def _get_dataset(name, update=False):
18
+ """Internal function to get dataset as list of dicts, downloading if necessary."""
19
+ if name not in DATASET_URLS:
20
+ raise ValueError(f"Unknown dataset: {name}")
21
+
22
+ url = DATASET_URLS[name]
23
+ data_dir = Path.home() / ".datamule" / "datasets"
24
+ file_path = data_dir / f"{name}.csv"
25
+
26
+ if not file_path.exists() or update:
27
+ print(f"Downloading {name}...")
28
+ data_dir.mkdir(parents=True, exist_ok=True)
29
+
30
+ response = requests.get(url, stream=True)
31
+ response.raise_for_status()
32
+
33
+ gz_path = file_path.with_suffix('.csv.gz')
34
+ with open(gz_path, 'wb') as f:
35
+ for chunk in response.iter_content(chunk_size=8192):
36
+ f.write(chunk)
37
+
38
+ with gzip.open(gz_path, 'rb') as f_in:
39
+ with open(file_path, 'wb') as f_out:
40
+ shutil.copyfileobj(f_in, f_out)
41
+
42
+ gz_path.unlink()
43
+
44
+ # Read CSV and return as list of dicts
45
+ with open(file_path, 'r') as f:
46
+ return list(csv.DictReader(f))
47
+
48
# Eagerly materialize the crosswalk so `datasets.cik_cusip_crosswalk` is a
# ready-to-use list of dicts as soon as the module is imported.
# NOTE(review): when the local cache is missing this triggers a network
# download at import time — confirm eager loading is intended.
cik_cusip_crosswalk = _get_dataset("cik_cusip_crosswalk")
@@ -12,6 +12,7 @@ from selectolax.parser import HTMLParser
12
12
  from pathlib import Path
13
13
  import webbrowser
14
14
  from secsgml.utils import bytes_to_str
15
+ import tempfile
15
16
 
16
17
  from .tables.tables import Tables
17
18
 
@@ -36,18 +37,19 @@ class Document:
36
37
  # this will be filled by parsed
37
38
  self._data = None
38
39
  self._tables = None
40
+ self._text = None
39
41
 
40
42
 
41
43
 
42
44
  #_load_text_content
43
45
  def _preprocess_txt_content(self):
44
- return self.content.translate(str.maketrans({
46
+ self._text = self.content.decode().translate(str.maketrans({
45
47
  '\xa0': ' ', '\u2003': ' ',
46
48
  '\u2018': "'", '\u2019': "'",
47
49
  '\u201c': '"', '\u201d': '"'
48
50
  }))
49
51
 
50
- # will deprecate this when we add html2dict
52
+ # needs work
51
53
  def _preprocess_html_content(self):
52
54
  parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
53
55
 
@@ -95,7 +97,7 @@ class Document:
95
97
  while '\n\n\n' in text:
96
98
  text = text.replace('\n\n\n', '\n\n')
97
99
 
98
- return text.translate(str.maketrans({
100
+ self._text = text.translate(str.maketrans({
99
101
  '\xa0': ' ', '\u2003': ' ',
100
102
  '\u2018': "'", '\u2019': "'",
101
103
  '\u201c': '"', '\u201d': '"'
@@ -116,7 +118,7 @@ class Document:
116
118
  mapping_dict = None
117
119
 
118
120
  if self.extension == '.txt':
119
- content = self._preprocess_txt_content()
121
+ content = self.text
120
122
  if self.type == '10-Q':
121
123
  mapping_dict = dict_10q
122
124
  elif self.type == '10-K':
@@ -224,6 +226,15 @@ class Document:
224
226
  self.parse()
225
227
  return self._data
226
228
 
229
+ @property
230
+ def text(self):
231
+ if self._text is None:
232
+ if self.extension in ['.htm','.html']:
233
+ self._preprocess_html_content()
234
+ elif self.extension == '.txt':
235
+ self._preprocess_txt_content()
236
+ return self._text
237
+
227
238
  def write_json(self, output_filename=None):
228
239
  if not self.data:
229
240
  self.parse()
@@ -308,13 +319,23 @@ class Document:
308
319
  self.parse()
309
320
 
310
321
  if not self.data:
311
- if self.extension in ['.jpg', '.png', '.pdf']:
312
- webbrowser.open('file://' + str(self.path))
313
- else:
314
- pass
322
+ pass
315
323
  else:
316
324
  visualize_dict(self.data)
317
325
 
326
+ # alpha feature
327
+ def open(self):
328
+ """Open the document. Experimental. Creates copy in temp, rather than use tar path for now."""
329
+ if self.extension in ['.htm', '.html','.txt','.jpg','.png', '.pdf']:
330
+ # Create a temporary file with the content and open it
331
+
332
+ with tempfile.NamedTemporaryFile(mode='wb', suffix=self.extension, delete=False) as f:
333
+ f.write(self.content)
334
+ temp_path = f.name
335
+ webbrowser.open('file://' + temp_path)
336
+ else:
337
+ print(f"Cannot open files with extension {self.extension}")
338
+
318
339
  def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
319
340
  if not self.data:
320
341
  self.parse()
@@ -96,12 +96,16 @@ class Portfolio:
96
96
  # Create submissions for each accession
97
97
  submissions = []
98
98
  for accession_prefix in accession_prefixes:
99
- submission = Submission(
100
- batch_tar_path=batch_tar_path,
101
- accession_prefix=accession_prefix,
102
- portfolio_ref=self
103
- )
104
- submissions.append(submission)
99
+ try:
100
+ submission = Submission(
101
+ batch_tar_path=batch_tar_path,
102
+ accession_prefix=accession_prefix,
103
+ portfolio_ref=self
104
+ )
105
+ submissions.append(submission)
106
+ except Exception as e:
107
+ pass
108
+ #print(f"Path: {batch_tar_path}. Exception: {e}")
105
109
  pbar.update(1) # Update progress for each successful submission
106
110
 
107
111
  return submissions
@@ -12,6 +12,7 @@ import urllib.request
12
12
  from secxbrl import parse_inline_xbrl
13
13
  from company_fundamentals import construct_fundamentals
14
14
  from decimal import Decimal
15
+ from .utils.format_accession import format_accession
15
16
 
16
17
 
17
18
  class Submission:
@@ -93,11 +94,10 @@ class Submission:
93
94
  # standardize metadata
94
95
  metadata = transform_metadata_string(metadata)
95
96
  self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
96
- self.accession = self.metadata.content['accession-number']
97
+
98
+ # let's just use accession-prefix, to get around malformed metadata files (1995 has a lot!)
99
+ self.accession = format_accession(self.accession_prefix,'dash')
97
100
 
98
- # Band-aid fix: some SGML files in the SEC are bad lol, so they have TWO header sections. Will fix post w/ my cleaned archive
99
- if isinstance(self.accession,list):
100
- self.accession = self.accession[0]
101
101
  #print(f"s: {self.metadata.content['accession-number']} : {batch_tar_path}")
102
102
  self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
103
103
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.1.2
3
+ Version: 2.1.4
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -1,6 +1,7 @@
1
1
  setup.py
2
2
  datamule/__init__.py
3
3
  datamule/config.py
4
+ datamule/datasets.py
4
5
  datamule/helper.py
5
6
  datamule/index.py
6
7
  datamule/package_updater.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
32
32
  setup(
33
33
  name="datamule",
34
34
  author="John Friedman",
35
- version="2.1.2",
35
+ version="2.1.4",
36
36
  description="Work with SEC submissions at scale.",
37
37
  packages=find_packages(include=['datamule', 'datamule.*']),
38
38
  url="https://github.com/john-friedman/datamule-python",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes