datamule 1.2.0__py3-none-any.whl → 1.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/submission.py CHANGED
@@ -1,8 +1,10 @@
1
1
  from pathlib import Path
2
2
  import json
3
- from .document import Document
3
+ from .document.document import Document
4
4
  from secsgml import parse_sgml_submission_into_memory
5
- from pathlib import Path
5
+ import os
6
+ import aiofiles
7
+
6
8
 
7
9
  class Submission:
8
10
  def __init__(self, path=None,sgml_content=None,keep_document_types=None):
@@ -14,7 +16,13 @@ class Submission:
14
16
  if sgml_content is not None:
15
17
  self.path = None
16
18
  self.metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
19
+
20
+ # code dupe
21
+ self.accession = self.metadata['accession-number']
22
+ self.filing_date= f"{self.metadata['filing-date'][:4]}-{self.metadata['filing-date'][4:6]}-{self.metadata['filing-date'][6:8]}"
23
+
17
24
  self.documents = []
25
+ filtered_metadata_documents = []
18
26
 
19
27
  for idx,doc in enumerate(self.metadata['documents']):
20
28
  type = doc.get('type')
@@ -24,16 +32,25 @@ class Submission:
24
32
  continue
25
33
  filename = doc.get('filename')
26
34
  extension = Path(filename).suffix
27
- self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension))
35
+ self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
28
36
 
37
+ filtered_metadata_documents.append(doc)
38
+
39
+ self.metadata['documents'] = filtered_metadata_documents
29
40
 
30
41
  if path is not None:
31
42
  self.path = Path(path)
32
43
  metadata_path = self.path / 'metadata.json'
33
44
  with metadata_path.open('r') as f:
34
45
  self.metadata = json.load(f)
46
+
47
+ # Code dupe
48
+ self.accession = self.metadata['accession-number']
49
+ self.filing_date= f"{self.metadata['filing-date'][:4]}-{self.metadata['filing-date'][4:6]}-{self.metadata['filing-date'][6:8]}"
35
50
 
36
51
 
52
+
53
+
37
54
  def document_type(self, document_type):
38
55
  # Convert single document type to list for consistent handling
39
56
  if isinstance(document_type, str):
@@ -54,10 +71,13 @@ class Submission:
54
71
  document_path = self.path / filename
55
72
  extension = document_path.suffix
56
73
 
57
- with document_path.open('r') as f:
74
+ with document_path.open('rb') as f:
58
75
  content = f.read()
59
76
 
60
- yield Document(type=doc['type'], content=content, extension=extension)
77
+ if extension in ['.htm','.html','.txt','.xml']:
78
+ content = content.decode('utf-8', errors='replace')
79
+
80
+ yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
61
81
  # if loaded from sgml_content
62
82
  else:
63
83
  yield self.documents[idx]
@@ -78,10 +98,13 @@ class Submission:
78
98
 
79
99
  # check if the file exists
80
100
  if document_path.exists():
81
- with document_path.open('r') as f:
101
+ with document_path.open('rb') as f:
82
102
  content = f.read()
83
103
 
84
- yield Document(type=doc['type'], content=content, extension=extension)
104
+ if extension in ['.htm','.html','.txt','.xml']:
105
+ content = content.decode('utf-8', errors='replace')
106
+
107
+ yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
85
108
  else:
86
109
  print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
87
110
 
@@ -89,25 +112,87 @@ class Submission:
89
112
  else:
90
113
  yield self.documents[idx]
91
114
 
92
- # keep documents by document type
93
- def keep(self, document_type):
94
- # Convert single document type to list for consistent handling
95
- if isinstance(document_type, str):
96
- document_types = [document_type]
97
- else:
98
- document_types = document_type
99
115
 
100
- if self.path is not None:
101
- for doc in self.metadata['documents']:
116
+
117
+
118
+ def save(self, output_dir="filings"):
119
+ file_dir = Path(output_dir) / str(self.accession)
120
+ file_dir.mkdir(parents=True, exist_ok=True)
121
+
122
+ metadata_path = file_dir / "metadata.json"
123
+ with open(metadata_path, 'w') as f:
124
+ json.dump(self.metadata, f, indent=4)
125
+
126
+ for idx, doc in enumerate(self.metadata['documents']):
127
+ try:
102
128
  filename = doc.get('filename')
103
- type = doc.get('type')
104
- if type not in document_types:
105
- # oh we need handling here for sequences case
106
- if filename is None:
107
- filename = doc.sequence + '.txt'
129
+ if filename is None:
130
+ filename = f"{doc.get('sequence', idx)}.txt"
131
+ except (KeyError, IndexError):
132
+ filename = f"{idx}.txt"
133
+
134
+ doc_path = file_dir / filename
135
+
136
+ if self.path is not None:
137
+ if hasattr(self, 'documents') and self.documents:
138
+ content = self.documents[idx].content
139
+ else:
140
+ orig_doc_path = self.path / filename
141
+ if orig_doc_path.exists():
142
+ with open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
143
+ content = f.read()
144
+ else:
145
+ print(f"Warning: File {orig_doc_path} does not exist, skipping.")
146
+ continue
147
+ else:
148
+ content = self.documents[idx].content
149
+
150
+ if isinstance(content, bytes):
151
+ with open(doc_path, 'wb') as f:
152
+ f.write(content)
153
+ else:
154
+ with open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
155
+ f.write(content)
156
+
157
+ return file_dir
108
158
 
109
- document_path = self.path / filename
110
- # delete the file
111
- document_path.unlink()
112
- else:
113
- print("Warning: keep() method is only available when loading from path.")
159
+ async def save_async(self, output_dir="filings"):
160
+ file_dir = Path(output_dir) / str(self.accession)
161
+ os.makedirs(file_dir, exist_ok=True)
162
+
163
+ metadata_path = file_dir / "metadata.json"
164
+ async with aiofiles.open(metadata_path, 'w') as f:
165
+ await f.write(json.dumps(self.metadata, indent=4))
166
+
167
+ for idx, doc in enumerate(self.metadata['documents']):
168
+ try:
169
+ filename = doc.get('filename')
170
+ if filename is None:
171
+ filename = f"{doc.get('sequence', idx)}.txt"
172
+ except (KeyError, IndexError):
173
+ filename = f"{idx}.txt"
174
+
175
+ doc_path = file_dir / filename
176
+
177
+ if self.path is not None:
178
+ if hasattr(self, 'documents') and self.documents:
179
+ content = self.documents[idx].content
180
+ else:
181
+ orig_doc_path = self.path / filename
182
+ if orig_doc_path.exists():
183
+ async with aiofiles.open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
184
+ content = await f.read()
185
+ else:
186
+ print(f"Warning: File {orig_doc_path} does not exist, skipping.")
187
+ continue
188
+ else:
189
+ content = self.documents[idx].content
190
+
191
+ if isinstance(content, bytes):
192
+ async with aiofiles.open(doc_path, 'wb') as f:
193
+ await f.write(content)
194
+ else:
195
+ async with aiofiles.open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
196
+ await f.write(content)
197
+
198
+ return file_dir
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.2.0
3
+ Version: 1.2.3
4
4
  Summary: Making it easier to use SEC filings.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -1,11 +1,15 @@
1
- datamule/__init__.py,sha256=l6YlwT5EeRxPlCtO5Jd4I8l266rSRUJyfFe97cRtSCM,991
1
+ datamule/__init__.py,sha256=8KioESb9y0Xwy72WuTfsYZnnMFdCrRhSv8DW-kZ4-To,1066
2
2
  datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
3
- datamule/document.py,sha256=CvuyazJ1qP8Ygpv49ikMc8DyGK7N-tApTU2Ccgv57q4,21556
4
3
  datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
5
4
  datamule/index.py,sha256=0txvbzPcvY1GsdxA-wGdLzAByxSeE_1VyyBp9mZEQRM,2292
6
- datamule/portfolio.py,sha256=yWt5gYTjV7rJsLiPUmhc6Vmr3lfvfCR5MSpLQ_6Gdp4,7104
5
+ datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
6
+ datamule/portfolio.py,sha256=so6j2KrkcZOToHIqkANAu3CC4QsfgaUN1zk9CrbRe1E,7225
7
7
  datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
8
- datamule/submission.py,sha256=LI7Zr60YbE_tU-v2N09k2dGjfztSgplKZACT3eRUkFE,4463
8
+ datamule/submission.py,sha256=HXuFL6snLevGk7DGlvPbjcBOJuccAIxEPXnkA1TXX8Y,8121
9
+ datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ datamule/document/document.py,sha256=BRnHPVt-vIT7EZTF-c-Ulv3N33xX9zE02Q9mKXVDeuY,9474
11
+ datamule/document/processing.py,sha256=fw-1OWfbmZhG1R8XpJx_vcGwz3_djmk0FrblHAMPmwc,27476
12
+ datamule/document/table.py,sha256=Sv9jTGiVhnWIY9nHaynUUixwbCrvbLsf0fdOnFR-NCY,10791
9
13
  datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
14
  datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
11
15
  datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
@@ -16,7 +20,7 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
16
20
  datamule/sec/rss/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
21
  datamule/sec/rss/monitor.py,sha256=6r4EYaSlGu6VYErlj9zXJsIMLVie1cfacSZU-ESfuBI,18231
18
22
  datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- datamule/sec/submissions/downloader.py,sha256=IB08W8-lQD5Bb0LgzrTN4Xi4HsCw24DybRLHqE1AUrU,3290
23
+ datamule/sec/submissions/downloader.py,sha256=60wX2Yml1UCuxOtU0xMxqqeyHhrypCmlDQ0jZF-StJo,2665
20
24
  datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
21
25
  datamule/sec/submissions/monitor.py,sha256=Im2kgnUehhTgyY2Vq3uk07n4Vkj4PjII_SsRDi8ehAE,5384
22
26
  datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
@@ -28,9 +32,9 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=WyJIwuy5mNMXWpx_IkhFzDMe9MOfQ-vNk
28
32
  datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
29
33
  datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
34
  datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
31
- datamule/seclibrary/downloader.py,sha256=Zb1TxsIz887tO3MJVP66siYVtNus89ti-g9oZ6VywrM,11500
35
+ datamule/seclibrary/downloader.py,sha256=fJztJ_sEfv2oHHbDff07DRlXLmztXnzt3Yvv5YaZgGk,13718
32
36
  datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
33
- datamule-1.2.0.dist-info/METADATA,sha256=IDVSWEibrVQWmrNKEXrD1oaucOpmP7Agr4f6bv6o3Kg,512
34
- datamule-1.2.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
35
- datamule-1.2.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
36
- datamule-1.2.0.dist-info/RECORD,,
37
+ datamule-1.2.3.dist-info/METADATA,sha256=3gODk6YjozgMTYnjvXRX_pox_Otkq7tSDZY2LEl6MiU,512
38
+ datamule-1.2.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
39
+ datamule-1.2.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
40
+ datamule-1.2.3.dist-info/RECORD,,