datamule 1.6.3__py3-none-any.whl → 1.6.5__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -65,6 +65,7 @@ def clean_efts_hits(hits):
65
65
  class Monitor():
66
66
  def __init__(self):
67
67
  self.accessions = deque(maxlen=50000)
68
+ self.accessions_set = set()
68
69
  self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
69
70
  self.efts_query = EFTSQuery(quiet=True)
70
71
  self.efts_query.limiter = self.ratelimiters['sec.gov']
@@ -250,7 +251,14 @@ class Monitor():
250
251
  new_items = []
251
252
  for item in items:
252
253
  accession = item['accession']
253
- if accession not in self.accessions:
254
+ if accession not in self.accessions_set: # O(1) lookup instead of O(n)
255
+ # Check if deque is at capacity and will evict an item
256
+ if len(self.accessions) == self.accessions.maxlen:
257
+ evicted_accession = self.accessions[0] # Get leftmost item that will be evicted
258
+ self.accessions_set.discard(evicted_accession) # Remove from set
259
+
260
+ # Add to both structures
254
261
  self.accessions.append(accession)
262
+ self.accessions_set.add(accession)
255
263
  new_items.append(item)
256
264
  return new_items
datamule/submission.py CHANGED
@@ -144,24 +144,57 @@ class Submission:
144
144
 
145
145
  # Handle regular path case
146
146
  else:
147
- # Use exact filename from metadata
148
- document_path = self.path / filename
149
-
150
- if not document_path.exists():
151
- raise FileNotFoundError(f"Document file not found: {document_path}")
152
-
153
- with document_path.open('rb') as f:
154
- content = f.read()
155
-
156
- # Decompress if needed based on filename extension
157
- if is_compressed == 'gzip':
158
- content = gzip.decompress(content)
159
- elif is_compressed == 'zstd':
160
- content = zstd.ZstdDecompressor().decompress(content)
147
+ # Check if path is a tar file (old format)
148
+ if self.path.suffix == '.tar':
149
+ with tarfile.open(self.path, 'r') as tar:
150
+ # Try to extract the file, handling compression
151
+ try:
152
+ content = tar.extractfile(filename).read()
153
+ actual_filename = filename
154
+ except:
155
+ try:
156
+ content = tar.extractfile(filename + '.gz').read()
157
+ actual_filename = filename + '.gz'
158
+ is_compressed = 'gzip'
159
+ except:
160
+ try:
161
+ content = tar.extractfile(filename + '.zst').read()
162
+ actual_filename = filename + '.zst'
163
+ is_compressed = 'zstd'
164
+ except:
165
+ raise FileNotFoundError(f"Document file not found in tar: {filename}")
166
+
167
+ # Decompress if compressed
168
+ if is_compressed == 'gzip':
169
+ content = gzip.decompress(content)
170
+ elif is_compressed == 'zstd':
171
+ content = zstd.ZstdDecompressor().decompress(content)
172
+
173
+ # Decode text files
174
+ if extension in ['.htm', '.html', '.txt', '.xml']:
175
+ content = content.decode('utf-8', errors='replace')
176
+
177
+ document_path = f"{self.path}::{actual_filename}"
161
178
 
162
- # Decode text files
163
- if extension in ['.htm', '.html', '.txt', '.xml']:
164
- content = content.decode('utf-8', errors='replace')
179
+ else:
180
+ # Regular directory case
181
+ document_path = self.path / filename
182
+
183
+ if not document_path.exists():
184
+ raise FileNotFoundError(f"Document file not found: {document_path}")
185
+
186
+ with document_path.open('rb') as f:
187
+ content = f.read()
188
+
189
+ # Decompress if needed based on filename extension
190
+ if is_compressed == 'gzip':
191
+ content = gzip.decompress(content)
192
+ elif is_compressed == 'zstd':
193
+ content = zstd.ZstdDecompressor().decompress(content)
194
+
195
+ # Decode text files
196
+ if extension in ['.htm', '.html', '.txt', '.xml']:
197
+ content = content.decode('utf-8', errors='replace')
165
198
 
166
199
  return Document(
167
200
  type=doc['type'],
@@ -171,7 +204,6 @@ class Submission:
171
204
  accession=self.accession,
172
205
  path=document_path
173
206
  )
174
-
175
207
  def __iter__(self):
176
208
  """Make Submission iterable by yielding all documents."""
177
209
  for idx in range(len(self.metadata.content['documents'])):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 1.6.3
3
+ Version: 1.6.5
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -6,7 +6,7 @@ datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,9
6
6
  datamule/portfolio.py,sha256=tADqQMkFaFyjanbJ0QcaOHGdJJB254rOg29FW7a13l0,11835
7
7
  datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
8
8
  datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
9
- datamule/submission.py,sha256=yDPglaFJ65nXn7Lxh-JFTQGKVVmBJDHBVWTf4UEUm2M,8610
9
+ datamule/submission.py,sha256=ooLsesZ5HkgSWyEFID4u08CobTxdo35eAUHSCB6fw2k,10332
10
10
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
11
11
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  datamule/datamule/sec_connector.py,sha256=T3edE7I-d4oHysqj7zYlIOxH3Fuauj9tfw39UdFWvB8,2393
@@ -52,7 +52,7 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
52
52
  datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
53
  datamule/sec/submissions/downloader.py,sha256=zGS0oJJI8tVF_GnVpZm20MymdYxnjrEjQioSVggw7Ck,1486
54
54
  datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
55
- datamule/sec/submissions/monitor.py,sha256=1JUMRYsTqtd31hX3UrUA_aXFUmZN6n-V7h0i1gavNOs,11395
55
+ datamule/sec/submissions/monitor.py,sha256=6mE0NZFdPId69t4V53GwBb9sqtRN7HE54sU3WpU0bnY,11900
56
56
  datamule/sec/submissions/streamer.py,sha256=Qydj40CmWB_wsPv2dibefRohmCokegG2pR7iZ9C3xLQ,11584
57
57
  datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
58
58
  datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -66,7 +66,7 @@ datamule/seclibrary/downloader.py,sha256=3jEy67oiEg8BF20KcKCx2KC0UjHzhiepdu29TOa
66
66
  datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
67
67
  datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
68
68
  datamule/utils/construct_submissions_data.py,sha256=aX7ZaAp3zXHLcv4TFk_rGwjb8r7yNDQDFVg4nPf60kM,5934
69
- datamule-1.6.3.dist-info/METADATA,sha256=9tb_ecnMVFHYq-Jcj_O0xAYUtM6v2PEZRxdEtPnorD4,524
70
- datamule-1.6.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
71
- datamule-1.6.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
72
- datamule-1.6.3.dist-info/RECORD,,
69
+ datamule-1.6.5.dist-info/METADATA,sha256=2Svo4CVhWzVDa9HFLOOwzIzgF84ItLQT-2BI6WR894s,524
70
+ datamule-1.6.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
71
+ datamule-1.6.5.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
72
+ datamule-1.6.5.dist-info/RECORD,,