datamule 1.1.8__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/__init__.py CHANGED
@@ -1,10 +1,11 @@
  from .submission import Submission
  from .portfolio import Portfolio
- from .document import Document
+ from .document.document import Document
  from .helper import _load_package_csv, load_package_dataset
  from .config import Config
  from .sheet import Sheet
  from .index import Index
+ from .package_updater import PackageUpdater
 
 
  # Keep the notebook environment setup
@@ -37,4 +38,5 @@ __all__ = [
  'Config',
  'Sheet',
  'Index',
+ 'PackageUpdater',
  ]
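
The move of Document into a subpackage does not change the package's public import path: Document is still re-exported from the package root, now alongside the new PackageUpdater export. A minimal sketch of the unchanged top-level imports (illustrative only; PackageUpdater's own interface is not shown in this diff):

    from datamule import Document, PackageUpdater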
@@ -0,0 +1,255 @@
+ import json
+ import csv
+ import re
+ from doc2dict import xml2dict, txt2dict, dict2dict
+ from doc2dict.mapping import flatten_hierarchy
+ from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
+ from ..mapping_dicts.xml_mapping_dicts import dict_345
+ from selectolax.parser import HTMLParser
+ from .processing import process_tabular_data
+ from pathlib import Path
+
+ class Document:
+     def __init__(self, type, content, extension,accession,filing_date,path=None):
+
+         self.type = type
+         extension = extension.lower()
+         self.accession = accession
+         self.filing_date = filing_date
+         self.content = content
+
+         if path is not None:
+             self.path = path
+
+         self.extension = extension
+         # this will be filled by parsed
+         self.data = None
+
+     #_load_text_content
+     def _preprocess_txt_content(self):
+         return self.content.translate(str.maketrans({
+             '\xa0': ' ', '\u2003': ' ',
+             '\u2018': "'", '\u2019': "'",
+             '\u201c': '"', '\u201d': '"'
+         }))
+
+     # will deprecate this when we add html2dict
+     def _preprocess_html_content(self):
+         parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
+
+         # Remove hidden elements first
+         hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
+         for node in hidden_nodes:
+             node.decompose()
+
+         blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
+         lines = []
+         current_line = []
+
+         def flush_line():
+             if current_line:
+                 # Don't add spaces between adjacent spans
+                 lines.append(''.join(current_line))
+                 current_line.clear()
+
+         for node in parser.root.traverse(include_text=True):
+             if node.tag in ('script', 'style', 'css'):
+                 continue
+
+             if node.tag in blocks:
+                 flush_line()
+                 lines.append('')
+
+             if node.text_content:
+                 text = node.text_content.strip()
+                 if text:
+                     if node.tag in blocks:
+                         flush_line()
+                         lines.append(text)
+                         lines.append('')
+                     else:
+                         # Only add space if nodes aren't directly adjacent
+                         if current_line and not current_line[-1].endswith(' '):
+                             if node.prev and node.prev.text_content:
+                                 if node.parent != node.prev.parent or node.prev.next != node:
+                                     current_line.append(' ')
+                         current_line.append(text)
+
+         flush_line()
+
+         text = '\n'.join(lines)
+         while '\n\n\n' in text:
+             text = text.replace('\n\n\n', '\n\n')
+
+         return text.translate(str.maketrans({
+             '\xa0': ' ', '\u2003': ' ',
+             '\u2018': "'", '\u2019': "'",
+             '\u201c': '"', '\u201d': '"'
+         }))
+
+     def contains_string(self, pattern):
+         """Works for select files"""
+         if self.extension in ['.htm', '.html', '.txt','.xml']:
+             return bool(re.search(pattern, self.content))
+         return False
+
+     # Note: this method will be heavily modified in the future
+     def parse(self):
+         # check if we have already parsed the content
+         if self.data:
+             return self.data
+
+         # preprocess content
+         if self.extension == '.txt':
+             self.content = self._preprocess_txt_content()
+         elif self.extension in ['.htm', '.html']:
+             self.content = self._preprocess_html_content()
+
+         mapping_dict = None
+
+         if self.extension == '.xml':
+             if self.type in ['3', '4', '5', '3/A', '4/A', '5/A']:
+                 mapping_dict = dict_345
+
+             self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+
+         # will deprecate this when we add html2dict
+         elif self.extension in ['.htm', '.html','.txt']:
+
+             if self.type == '10-K':
+                 mapping_dict = dict_10k
+             elif self.type == '10-Q':
+                 mapping_dict = dict_10q
+             elif self.type == '8-K':
+                 mapping_dict = dict_8k
+             elif self.type == 'SC 13D':
+                 mapping_dict = dict_13d
+             elif self.type == 'SC 13G':
+                 mapping_dict = dict_13g
+
+             self.data = {}
+             self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
+         return self.data
+
+     def write_json(self, output_filename=None):
+         if not self.data:
+             self.parse()
+
+         with open(output_filename, 'w',encoding='utf-8') as f:
+             json.dump(self.data, f, indent=2)
+
+     def to_tabular(self):
+         if self.extension != '.xml':
+             return []
+         self.parse()
+         return process_tabular_data(self)
+
+     def write_csv(self, output_folder, accession_number=None):
+
+         tables = self.to_tabular(accession_number)
+
+         if not tables:
+             return
+
+         for table in tables:
+             fieldnames = table.columns
+             output_filename = Path(output_folder) / f"{table.type}.csv"
+
+             with open(output_filename, 'w', newline='') as csvfile:
+                 writer = csv.DictWriter(csvfile,fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 writer.writerows(table.data)
+
+     def _document_to_section_text(self, document_data, parent_key=''):
+         items = []
+
+         if isinstance(document_data, dict):
+             for key, value in document_data.items():
+                 # Build the section name
+                 section = f"{parent_key}_{key}" if parent_key else key
+
+                 # If the value is a dict, recurse
+                 if isinstance(value, dict):
+                     items.extend(self._document_to_section_text(value, section))
+                 # If it's a list, handle each item
+                 elif isinstance(value, list):
+                     for i, item in enumerate(value):
+                         if isinstance(item, dict):
+                             items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
+                         else:
+                             items.append({
+                                 'section': f"{section}_{i+1}",
+                                 'text': str(item)
+                             })
+                 # Base case - add the item
+                 else:
+                     items.append({
+                         'section': section,
+                         'text': str(value)
+                     })
+
+         return items
+
+     # this will all have to be changed. default will be to flatten everything
+     # candidate for deletion
+     def __iter__(self):
+         self.parse()
+
+         # Let's remove XML iterable for now
+
+         # Handle text-based documents
+         if self.extension in ['.txt', '.htm', '.html']:
+             document_data = self.data
+             if not document_data:
+                 return iter([])
+
+             # Find highest hierarchy level from mapping dict
+             highest_hierarchy = float('inf')
+             section_type = None
+
+             if self.type in ['10-K', '10-Q']:
+                 mapping_dict = dict_10k if self.type == '10-K' else dict_10q
+             elif self.type == '8-K':
+                 mapping_dict = dict_8k
+             elif self.type == 'SC 13D':
+                 mapping_dict = dict_13d
+             elif self.type == 'SC 13G':
+                 mapping_dict = dict_13g
+             else:
+                 return iter([])
+
+             # Find section type with highest hierarchy number
+             highest_hierarchy = -1  # Start at -1 to find highest
+             for mapping in mapping_dict['rules']['mappings']:
+                 if mapping.get('hierarchy') is not None:
+                     if mapping['hierarchy'] > highest_hierarchy:
+                         highest_hierarchy = mapping['hierarchy']
+                         section_type = mapping['name']
+
+             if not section_type:
+                 return iter([])
+
+             # Extract sections of the identified type
+             def find_sections(data, target_type):
+                 sections = []
+                 if isinstance(data, dict):
+                     if data.get('type') == target_type:
+                         sections.append({
+                             'item': data.get('text', ''),
+                             'text': flatten_hierarchy(data.get('content', []))
+                         })
+                     for value in data.values():
+                         if isinstance(value, (dict, list)):
+                             sections.extend(find_sections(value, target_type))
+                 elif isinstance(data, list):
+                     for item in data:
+                         sections.extend(find_sections(item, target_type))
+                 return sections
+
+             return iter(find_sections(document_data, section_type))
+
+         return iter([])
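
For orientation, here is a minimal usage sketch of the new Document class, based only on the constructor and methods shown above. The file name, accession number, and filing date are hypothetical placeholders, not values taken from this diff:

    from datamule import Document

    # Hypothetical 8-K filing saved locally; in practice the content comes from a downloaded submission.
    with open("example_8k.htm", "r", encoding="utf-8") as f:
        html = f.read()

    doc = Document(type="8-K", content=html, extension=".htm",
                   accession="0000000000-24-000001", filing_date="2024-01-01")

    data = doc.parse()                 # preprocesses the HTML and maps it with dict_8k
    doc.write_json("example_8k.json")  # writes the parsed dict as JSON

    # Iterating a parsed 10-K/10-Q/8-K/SC 13D/SC 13G yields its highest-hierarchy
    # sections as {'item': ..., 'text': ...} dicts.
    for section in doc:
        print(section['item'])

Tabular output (to_tabular and write_csv) applies only to .xml documents such as Forms 3, 4, and 5, and is routed through process_tabular_data from the same subpackage.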