datamule 1.3.0__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {datamule-1.3.0 → datamule-1.4.0}/PKG-INFO +1 -1
  2. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/document.py +49 -21
  3. {datamule-1.3.0 → datamule-1.4.0}/datamule/helper.py +10 -21
  4. datamule-1.4.0/datamule/mapping_dicts/html_mapping_dicts.py +11 -0
  5. {datamule-1.3.0 → datamule-1.4.0}/datamule.egg-info/PKG-INFO +1 -1
  6. {datamule-1.3.0 → datamule-1.4.0}/datamule.egg-info/SOURCES.txt +1 -0
  7. {datamule-1.3.0 → datamule-1.4.0}/setup.py +1 -1
  8. {datamule-1.3.0 → datamule-1.4.0}/datamule/__init__.py +0 -0
  9. {datamule-1.3.0 → datamule-1.4.0}/datamule/config.py +0 -0
  10. {datamule-1.3.0 → datamule-1.4.0}/datamule/data/listed_filer_metadata.csv +0 -0
  11. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/__init__.py +0 -0
  12. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/__init__.py +0 -0
  13. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/atsn.py +0 -0
  14. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/cfportal.py +0 -0
  15. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/d.py +0 -0
  16. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/ex102_abs.py +0 -0
  17. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/ex99a_sdr.py +0 -0
  18. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/ex99c_sdr.py +0 -0
  19. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/ex99g_sdr.py +0 -0
  20. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/ex99i_sdr.py +0 -0
  21. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/information_table.py +0 -0
  22. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/nmfp.py +0 -0
  23. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/npx.py +0 -0
  24. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/onefourtyfour.py +0 -0
  25. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/ownership.py +0 -0
  26. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/proxy_voting_record.py +0 -0
  27. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/sbs.py +0 -0
  28. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/sbsef.py +0 -0
  29. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/schedule13.py +0 -0
  30. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/sdr.py +0 -0
  31. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/submission_metadata.py +0 -0
  32. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/ta.py +0 -0
  33. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/thirteenfhr.py +0 -0
  34. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/twentyfivense.py +0 -0
  35. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  36. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/processing.py +0 -0
  37. {datamule-1.3.0 → datamule-1.4.0}/datamule/document/table.py +0 -0
  38. {datamule-1.3.0 → datamule-1.4.0}/datamule/index.py +0 -0
  39. {datamule-1.3.0 → datamule-1.4.0}/datamule/mapping_dicts/__init__.py +0 -0
  40. {datamule-1.3.0 → datamule-1.4.0}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  41. {datamule-1.3.0 → datamule-1.4.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  42. {datamule-1.3.0 → datamule-1.4.0}/datamule/package_updater.py +0 -0
  43. {datamule-1.3.0 → datamule-1.4.0}/datamule/portfolio.py +0 -0
  44. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/__init__.py +0 -0
  45. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/infrastructure/__init__.py +0 -0
  46. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  47. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/submissions/__init__.py +0 -0
  48. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/submissions/downloader.py +0 -0
  49. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/submissions/eftsquery.py +0 -0
  50. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/submissions/monitor.py +0 -0
  51. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/submissions/streamer.py +0 -0
  52. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/submissions/textsearch.py +0 -0
  53. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/utils.py +0 -0
  54. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/xbrl/__init__.py +0 -0
  55. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  56. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  57. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  58. {datamule-1.3.0 → datamule-1.4.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  59. {datamule-1.3.0 → datamule-1.4.0}/datamule/seclibrary/__init__.py +0 -0
  60. {datamule-1.3.0 → datamule-1.4.0}/datamule/seclibrary/bq.py +0 -0
  61. {datamule-1.3.0 → datamule-1.4.0}/datamule/seclibrary/downloader.py +0 -0
  62. {datamule-1.3.0 → datamule-1.4.0}/datamule/seclibrary/query.py +0 -0
  63. {datamule-1.3.0 → datamule-1.4.0}/datamule/sheet.py +0 -0
  64. {datamule-1.3.0 → datamule-1.4.0}/datamule/submission.py +0 -0
  65. {datamule-1.3.0 → datamule-1.4.0}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-1.3.0 → datamule-1.4.0}/datamule.egg-info/requires.txt +0 -0
  67. {datamule-1.3.0 → datamule-1.4.0}/datamule.egg-info/top_level.txt +0 -0
  68. {datamule-1.3.0 → datamule-1.4.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.3.0
3
+ Version: 1.4.0
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -3,11 +3,14 @@ import csv
3
3
  import re
4
4
  from doc2dict import xml2dict, txt2dict, dict2dict
5
5
  from doc2dict.mapping import flatten_hierarchy
6
+ from doc2dict import html2dict, visualize_dict, get_title, unnest_dict
6
7
  from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
7
8
  from ..mapping_dicts.xml_mapping_dicts import dict_345
9
+ from ..mapping_dicts.html_mapping_dicts import dict_10k_html, dict_10q_html, dict_8k_html
8
10
  from selectolax.parser import HTMLParser
9
11
  from .processing import process_tabular_data
10
12
  from pathlib import Path
13
+ import webbrowser
11
14
 
12
15
  class Document:
13
16
  def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -99,26 +102,10 @@ class Document:
99
102
  if self.data:
100
103
  return self.data
101
104
 
102
- # preprocess content
103
- if self.extension == '.txt':
104
- self.content = self._preprocess_txt_content()
105
- elif self.extension in ['.htm', '.html']:
106
- self.content = self._preprocess_html_content()
107
-
108
105
  mapping_dict = None
109
-
110
- if self.extension == '.xml':
111
- if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
112
- mapping_dict = dict_345
113
-
114
- self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
115
-
116
-
117
-
118
- # will deprecate this when we add html2dict
119
- elif self.extension in ['.htm', '.html','.txt']:
120
-
121
-
106
+
107
+ if self.extension == '.txt':
108
+ content = self._preprocess_txt_content()
122
109
  if self.type == '10-Q':
123
110
  mapping_dict = dict_10q
124
111
  elif self.type == '10-K':
@@ -131,8 +118,24 @@ class Document:
131
118
  mapping_dict = dict_13g
132
119
 
133
120
  self.data = {}
134
- self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
135
- return self.data
121
+ self.data['document'] = dict2dict(txt2dict(content=content, mapping_dict=mapping_dict))
122
+ elif self.extension in ['.htm', '.html']:
123
+ if self.type == '10-K':
124
+ mapping_dict = dict_10k_html
125
+ elif self.type == '10-Q':
126
+ mapping_dict = dict_10q_html
127
+ elif self.type == '8-K':
128
+ mapping_dict = dict_8k_html
129
+
130
+ dct = html2dict(content=self.content, mapping_dict=mapping_dict)
131
+ self.data = dct
132
+ elif self.extension == '.xml':
133
+ if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
134
+ mapping_dict = dict_345
135
+
136
+ self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
137
+ else:
138
+ pass
136
139
 
137
140
  def write_json(self, output_filename=None):
138
141
  if not self.data:
@@ -206,6 +209,31 @@ class Document:
206
209
  })
207
210
 
208
211
  return items
212
+
213
+ def visualize(self):
214
+ if not self.data:
215
+ self.parse()
216
+
217
+ if not self.data:
218
+ if self.extension in ['.jpg', '.png', '.pdf']:
219
+ webbrowser.open('file://' + str(self.path))
220
+ else:
221
+ pass
222
+ else:
223
+ visualize_dict(self.data)
224
+
225
+ def get_section(self, title, format='dict'):
226
+ if not self.data:
227
+ self.parse()
228
+
229
+ result = get_title(self.data,title)
230
+
231
+ if format == 'text':
232
+ result = [item[1] for item in result]
233
+ result = [unnest_dict(item) for item in result]
234
+
235
+ return result
236
+
209
237
 
210
238
  # this will all have to be changed. default will be to flatten everything
211
239
  # candidate for deletion
@@ -5,28 +5,17 @@ import os
5
5
 
6
6
  def _load_package_csv(name):
7
7
  """Load CSV files from package data directory"""
8
- # First try to load from the package data directory
9
- try:
10
- package_dir = os.path.dirname(os.path.dirname(__file__))
11
- csv_path = os.path.join(package_dir, "data", f"{name}.csv")
12
-
13
- # Fallback to the legacy location
14
- if not os.path.exists(csv_path):
15
- csv_path = Path.home() / ".datamule" / f"{name}.csv"
16
-
17
- data = []
18
- with open(csv_path, 'r') as csvfile:
19
- csv_reader = csv.DictReader(csvfile)
20
- for row in csv_reader:
21
- data.append(row)
22
-
23
- return data
8
+ package_dir = os.path.dirname(os.path.dirname(__file__))
9
+ csv_path = os.path.join(package_dir,"datamule", "data", f"{name}.csv")
10
+
11
+ data = []
12
+ with open(csv_path, 'r') as csvfile:
13
+ csv_reader = csv.DictReader(csvfile)
14
+ for row in csv_reader:
15
+ data.append(row)
16
+
17
+ return data
24
18
 
25
- except FileNotFoundError:
26
- raise FileNotFoundError(
27
- f"Required data file '{name}.csv' not found. "
28
- f"This file should be in the datamule package directory or in ~/.datamule/"
29
- )
30
19
 
31
20
  def load_package_dataset(dataset):
32
21
  if dataset =='listed_filer_metadata':
@@ -0,0 +1,11 @@
1
+ dict_10k_html = {
2
+ ('part',r'^part\s*([ivx]+)$') : 0,
3
+ ('signatures',r'^signatures?\.*$') : 0,
4
+ ('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
5
+ }
6
+ dict_10q_html = dict_10k_html
7
+
8
+ dict_8k_html = {
9
+ ('signatures',r'^signatures?\.*$') : 0,
10
+ ('item',r'^item\s*(\d+\.\d+)') : 0,
11
+ }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.3.0
3
+ Version: 1.4.0
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -42,6 +42,7 @@ datamule/document/mappings/thirteenfhr.py
42
42
  datamule/document/mappings/twentyfivense.py
43
43
  datamule/document/mappings/twentyfourf2nt.py
44
44
  datamule/mapping_dicts/__init__.py
45
+ datamule/mapping_dicts/html_mapping_dicts.py
45
46
  datamule/mapping_dicts/txt_mapping_dicts.py
46
47
  datamule/mapping_dicts/xml_mapping_dicts.py
47
48
  datamule/sec/__init__.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
32
32
  setup(
33
33
  name="datamule",
34
34
  author="John Friedman",
35
- version="1.3.0",
35
+ version="1.4.0",
36
36
  description="Work with SEC submissions at scale.",
37
37
  packages=find_packages(include=['datamule', 'datamule.*']),
38
38
  url="https://github.com/john-friedman/datamule-python",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes