datamule 0.429.tar.gz → 1.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {datamule-0.429 → datamule-1.0.0}/PKG-INFO +1 -2
  2. datamule-1.0.0/datamule/__init__.py +47 -0
  3. datamule-1.0.0/datamule/book.py +16 -0
  4. datamule-1.0.0/datamule/document.py +278 -0
  5. {datamule-0.429 → datamule-1.0.0}/datamule/downloader/downloader.py +18 -8
  6. {datamule-0.429 → datamule-1.0.0}/datamule/downloader/premiumdownloader.py +8 -5
  7. datamule-1.0.0/datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
  8. datamule-1.0.0/datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
  9. {datamule-0.429 → datamule-1.0.0}/datamule/monitor.py +3 -1
  10. datamule-1.0.0/datamule/portfolio.py +106 -0
  11. {datamule-0.429 → datamule-1.0.0}/datamule/submission.py +9 -9
  12. {datamule-0.429 → datamule-1.0.0}/datamule.egg-info/PKG-INFO +1 -2
  13. {datamule-0.429 → datamule-1.0.0}/datamule.egg-info/SOURCES.txt +4 -17
  14. {datamule-0.429 → datamule-1.0.0}/datamule.egg-info/requires.txt +3 -9
  15. datamule-1.0.0/setup.py +47 -0
  16. datamule-0.429/datamule/__init__.py +0 -74
  17. datamule-0.429/datamule/dataset_builder/dataset_builder.py +0 -259
  18. datamule-0.429/datamule/document.py +0 -142
  19. datamule-0.429/datamule/parser/document_parsing/basic_10k_parser.py +0 -82
  20. datamule-0.429/datamule/parser/document_parsing/basic_10q_parser.py +0 -73
  21. datamule-0.429/datamule/parser/document_parsing/basic_13d_parser.py +0 -58
  22. datamule-0.429/datamule/parser/document_parsing/basic_13g_parser.py +0 -61
  23. datamule-0.429/datamule/parser/document_parsing/basic_8k_parser.py +0 -84
  24. datamule-0.429/datamule/parser/document_parsing/form_d_parser.py +0 -70
  25. datamule-0.429/datamule/parser/document_parsing/generalized_item_parser.py +0 -78
  26. datamule-0.429/datamule/parser/document_parsing/helper.py +0 -75
  27. datamule-0.429/datamule/parser/document_parsing/information_table_parser_13fhr.py +0 -41
  28. datamule-0.429/datamule/parser/document_parsing/insider_trading_parser.py +0 -158
  29. datamule-0.429/datamule/parser/document_parsing/mappings.py +0 -95
  30. datamule-0.429/datamule/parser/document_parsing/n_port_p_parser.py +0 -70
  31. datamule-0.429/datamule/parser/document_parsing/sec_parser.py +0 -73
  32. datamule-0.429/datamule/parser/document_parsing/sgml_parser.py +0 -94
  33. datamule-0.429/datamule/parser/sgml_parsing/sgml_parser_cy.c +0 -20006
  34. datamule-0.429/datamule/portfolio.py +0 -82
  35. datamule-0.429/setup.py +0 -93
  36. {datamule-0.429 → datamule-1.0.0}/datamule/config.py +0 -0
  37. {datamule-0.429 → datamule-1.0.0}/datamule/data/company_former_names.csv +0 -0
  38. {datamule-0.429 → datamule-1.0.0}/datamule/data/company_metadata.csv +0 -0
  39. {datamule-0.429 → datamule-1.0.0}/datamule/data/company_tickers.csv +0 -0
  40. {datamule-0.429 → datamule-1.0.0}/datamule/data/sec-glossary.csv +0 -0
  41. {datamule-0.429 → datamule-1.0.0}/datamule/data/xbrl_descriptions.csv +0 -0
  42. {datamule-0.429 → datamule-1.0.0}/datamule/helper.py +0 -0
  43. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/__init__.py +0 -0
  44. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/helper.py +0 -0
  45. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot.py +0 -0
  46. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/__init__.py +0 -0
  47. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/server.py +0 -0
  48. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -0
  49. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -0
  50. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -0
  51. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -0
  52. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -0
  53. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/static/scripts/main.js +0 -0
  54. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -0
  55. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -0
  56. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -0
  57. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -0
  58. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -0
  59. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/search.py +0 -0
  60. {datamule-0.429 → datamule-1.0.0}/datamule/mulebot/tools.py +0 -0
  61. {datamule-0.429 → datamule-1.0.0}/datamule/packageupdater.py +0 -0
  62. {datamule-0.429 → datamule-1.0.0}/datamule.egg-info/dependency_links.txt +0 -0
  63. {datamule-0.429 → datamule-1.0.0}/datamule.egg-info/top_level.txt +0 -0
  64. {datamule-0.429 → datamule-1.0.0}/setup.cfg +0 -0
{datamule-0.429 → datamule-1.0.0}/PKG-INFO
@@ -1,10 +1,9 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 0.429
+ Version: 1.0.0
  Summary: Making it easier to use SEC filings.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
  Provides-Extra: mulebot
  Provides-Extra: mulebot_server
- Provides-Extra: dataset_builder
  Provides-Extra: all
datamule-1.0.0/datamule/__init__.py
@@ -0,0 +1,47 @@
+ from .downloader.downloader import Downloader
+ from .downloader.premiumdownloader import PremiumDownloader
+ from .monitor import Monitor
+ from .packageupdater import PackageUpdater
+ from .submission import Submission
+ from .portfolio import Portfolio
+ from .document import Document
+ from secsgml import parse_sgml_submission
+ from .helper import load_package_csv, load_package_dataset
+ from .config import Config
+
+
+ # Keep the notebook environment setup
+ def _is_notebook_env():
+     """Check if the code is running in a Jupyter or Colab environment."""
+     try:
+         shell = get_ipython().__class__.__name__
+         return shell in ('ZMQInteractiveShell', 'Shell', 'Google.Colab')
+     except NameError:
+         return False
+
+ from functools import lru_cache
+
+ @lru_cache(maxsize=1)
+ def _setup_notebook_env():
+     """Setup Jupyter/Colab-specific configurations if needed."""
+     if _is_notebook_env():
+         import nest_asyncio
+         nest_asyncio.apply()
+
+ # Set up notebook environment
+ _setup_notebook_env()
+
+ __all__ = [
+     'Downloader',
+     'PremiumDownloader',
+     'load_package_csv',
+     'load_package_dataset',
+     'Filing',
+     'Portfolio',
+     'Monitor',
+     'PackageUpdater',
+     'Submission',
+     'Document',
+     'parse_sgml_submission',
+     'Config'
+ ]
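
With the new __init__.py, the package's main entry points are importable directly from datamule. Note that __all__ still lists 'Filing' even though nothing named Filing is imported in this file. A minimal usage sketch follows; the constructor calls are illustrative assumptions, not taken from this diff:

    # Hypothetical usage of the 1.0.0 top-level API (constructor arguments assumed)
    from datamule import Downloader, PremiumDownloader, Portfolio, Document, Config

    downloader = Downloader()        # SEC EDGAR downloader (assumed no-arg constructor)
    premium = PremiumDownloader()    # datamule-hosted source (assumed no-arg constructor)
    # In Jupyter/Colab, importing datamule runs _setup_notebook_env(), which applies nest_asyncio.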
datamule-1.0.0/datamule/book.py
@@ -0,0 +1,16 @@
+ # Streams data rather than downloading it.
+ # additional functionality such as query by xbrl, and other db
+ # also this is basically our experimental rework of portfolio w/o disturbing existing users
+ # this is highly experimental and may not work as expected
+ # only for datamule source
+ # likely new bottleneck will be local parsing() - will be bypassed in future when we have parsed archive
+ # wow parsed archive is going to be crazy fast - like every 10k in 1 minute.
+
+ class Book():
+     pass
+     def process_submissions(self,cik,ticker,sic,submission_type,document_type,date,
+                             xbrl_query={},
+                             metadata_callback=None,
+                             document_callback=None,):
+         # grabs data and processes it
+         pass
datamule-1.0.0/datamule/document.py
@@ -0,0 +1,278 @@
+ import json
+ import csv
+ from .helper import convert_to_dashed_accession
+ import re
+ from doc2dict import xml2dict, txt2dict
+ from doc2dict.mapping import flatten_hierarchy
+ from .mapping_dicts import txt_mapping_dicts
+ from .mapping_dicts import xml_mapping_dicts
+ from selectolax.parser import HTMLParser
+
+ class Document:
+     def __init__(self, type, filename):
+         self.type = type
+         self.path = filename
+
+         self.data = None
+         self.content = None
+
+
+     def load_content(self,encoding='utf-8'):
+         with open(self.path, 'r',encoding=encoding) as f:
+             self.content = f.read()
+
+     def _load_text_content(self):
+         with open(self.path) as f:
+             return f.read().translate(str.maketrans({
+                 '\xa0': ' ', '\u2003': ' ',
+                 '\u2018': "'", '\u2019': "'",
+                 '\u201c': '"', '\u201d': '"'
+             }))
+
+     # will deprecate this when we add html2dict
+     def _load_html_content(self):
+         with open(self.path,'rb') as f:
+             parser = HTMLParser(f.read(),detect_encoding=True,decode_errors='ignore')
+
+         # Remove hidden elements first
+         hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
+         for node in hidden_nodes:
+             node.decompose()
+
+         blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
+         lines = []
+         current_line = []
+
+         def flush_line():
+             if current_line:
+                 # Don't add spaces between adjacent spans
+                 lines.append(''.join(current_line))
+                 current_line.clear()
+
+         for node in parser.root.traverse(include_text=True):
+             if node.tag in ('script', 'style', 'css'):
+                 continue
+
+             if node.tag in blocks:
+                 flush_line()
+                 lines.append('')
+
+             if node.text_content:
+                 text = node.text_content.strip()
+                 if text:
+                     if node.tag in blocks:
+                         flush_line()
+                         lines.append(text)
+                         lines.append('')
+                     else:
+                         # Only add space if nodes aren't directly adjacent
+                         if current_line and not current_line[-1].endswith(' '):
+                             if node.prev and node.prev.text_content:
+                                 if node.parent != node.prev.parent or node.prev.next != node:
+                                     current_line.append(' ')
+                         current_line.append(text)
+
+         flush_line()
+
+         text = '\n'.join(lines)
+         while '\n\n\n' in text:
+             text = text.replace('\n\n\n', '\n\n')
+
+         return text.translate(str.maketrans({
+             '\xa0': ' ', '\u2003': ' ',
+             '\u2018': "'", '\u2019': "'",
+             '\u201c': '"', '\u201d': '"'
+         }))
+
+     def _load_file_content(self):
+         if self.path.suffix =='.txt':
+             self.content = self._load_text_content()
+         elif self.path.suffix in ['.html','.htm']:
+             self.content = self._load_html_content()
+         else:
+             raise ValueError(f"Unsupported file type: {self.path.suffix}")
+
+
+     def contains_string(self, pattern):
+         """Currently only works for .htm, .html, and .txt files"""
+         if self.path.suffix in ['.htm', '.html', '.txt']:
+             if self.content is None:
+                 self.content = self._load_file_content(self.path)
+             return bool(re.search(pattern, self.content))
+         return False
+
+     # Note: this method will be heavily modified in the future
+     def parse(self):
+         mapping_dict = None
+
+         if self.path.suffix == '.xml':
+             if self.type in ['3', '4', '5']:
+                 mapping_dict = xml_mapping_dicts.dict_345
+
+             self.load_content()
+             self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+         # will deprecate this when we add html2dict
+         elif self.path.suffix in ['.htm', '.html','.txt']:
+             self._load_file_content()
+
+             if self.type == '10-K':
+                 mapping_dict = txt_mapping_dicts.dict_10k
+             elif self.type == '10-Q':
+                 mapping_dict = txt_mapping_dicts.dict_10q
+             elif self.type == '8-K':
+                 mapping_dict = txt_mapping_dicts.dict_8k
+             elif self.type == 'SC 13D':
+                 mapping_dict = txt_mapping_dicts.dict_13d
+             elif self.type == 'SC 13G':
+                 mapping_dict = txt_mapping_dicts.dict_13g
+
+             self.data = txt2dict(content=self.content, mapping_dict=mapping_dict)
+         return self.data
+
+     def write_json(self, output_filename=None):
+         if not self.data:
+             self.parse()
+
+         if output_filename is None:
+             output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
+
+         with open(output_filename, 'w',encoding='utf-8') as f:
+             json.dump(self.data, f, indent=2)
+
+     def write_csv(self, output_filename=None, accession_number=None):
+         self.parse()
+
+         if output_filename is None:
+             output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
+
+         with open(output_filename, 'w', newline='') as csvfile:
+             if not self.data:
+                 return output_filename
+
+             has_document = any('document' in item for item in self.data)
+
+             if has_document and 'document' in self.data:
+                 writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 flattened = self._flatten_dict(self.data['document'])
+                 for section, text in flattened.items():
+                     writer.writerow({'section': section, 'text': text})
+             else:
+                 fieldnames = list(self.data[0].keys())
+                 if accession_number:
+                     fieldnames.append('Accession Number')
+                 writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 for row in self.data:
+                     if accession_number:
+                         row['Accession Number'] = convert_to_dashed_accession(accession_number)
+                     writer.writerow(row)
+
+         return output_filename
+
+     def _document_to_section_text(self, document_data, parent_key=''):
+         items = []
+
+         if isinstance(document_data, dict):
+             for key, value in document_data.items():
+                 # Build the section name
+                 section = f"{parent_key}_{key}" if parent_key else key
+
+                 # If the value is a dict, recurse
+                 if isinstance(value, dict):
+                     items.extend(self._document_to_section_text(value, section))
+                 # If it's a list, handle each item
+                 elif isinstance(value, list):
+                     for i, item in enumerate(value):
+                         if isinstance(item, dict):
+                             items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
+                         else:
+                             items.append({
+                                 'section': f"{section}_{i+1}",
+                                 'text': str(item)
+                             })
+                 # Base case - add the item
+                 else:
+                     items.append({
+                         'section': section,
+                         'text': str(value)
+                     })
+
+         return items
+
+     # we'll modify this for every dict
+     def _flatten_dict(self, d, parent_key=''):
+         items = {}
+
+         if isinstance(d, list):
+             return [self._flatten_dict(item) for item in d]
+
+         for k, v in d.items():
+             new_key = f"{parent_key}_{k}" if parent_key else k
+
+             if isinstance(v, dict):
+                 items.update(self._flatten_dict(v, new_key))
+             else:
+                 items[new_key] = str(v)
+
+         return items
+
+     # this will all have to be changed. default will be to flatten everything
+     def __iter__(self):
+         if not self.data:
+             self.parse()
+
+         # Let's remove XML iterable for now
+
+         # Handle text-based documents
+         if self.path.suffix in ['.txt', '.htm', '.html']:
+             document_data = self.data
+             if not document_data:
+                 return iter([])
+
+             # Find highest hierarchy level from mapping dict
+             highest_hierarchy = float('inf')
+             section_type = None
+
+             if self.type in ['10-K', '10-Q']:
+                 mapping_dict = txt_mapping_dicts.dict_10k if self.type == '10-K' else txt_mapping_dicts.dict_10q
+             elif self.type == '8-K':
+                 mapping_dict = txt_mapping_dicts.dict_8k
+             elif self.type == 'SC 13D':
+                 mapping_dict = txt_mapping_dicts.dict_13d
+             elif self.type == 'SC 13G':
+                 mapping_dict = txt_mapping_dicts.dict_13g
+             else:
+                 return iter([])
+
+             # Find section type with highest hierarchy number
+             highest_hierarchy = -1 # Start at -1 to find highest
+             for mapping in mapping_dict['rules']['mappings']:
+                 if mapping.get('hierarchy') is not None:
+                     if mapping['hierarchy'] > highest_hierarchy:
+                         highest_hierarchy = mapping['hierarchy']
+                         section_type = mapping['name']
+
+             if not section_type:
+                 return iter([])
+
+             # Extract sections of the identified type
+             def find_sections(data, target_type):
+                 sections = []
+                 if isinstance(data, dict):
+                     if data.get('type') == target_type:
+                         sections.append({
+                             'item': data.get('text', ''),
+                             'text': flatten_hierarchy(data.get('content', []))
+                         })
+                     for value in data.values():
+                         if isinstance(value, (dict, list)):
+                             sections.extend(find_sections(value, target_type))
+                 elif isinstance(data, list):
+                     for item in data:
+                         sections.extend(find_sections(item, target_type))
+                 return sections
+
+             return iter(find_sections(document_data, section_type))
+
+         return iter([])
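
The new Document class ties file loading, doc2dict parsing, and section iteration together: parse() picks a mapping dict from the form type and file suffix, write_json/write_csv serialize the result, and iterating the object yields the sections at the deepest mapped hierarchy level. Note that the class mixes pathlib-style access (self.path.suffix) with str methods (self.path.rsplit in write_json/write_csv), and contains_string calls self._load_file_content(self.path) even though that method takes no argument, so those paths may need care. A rough usage sketch, assuming a pathlib.Path input and a made-up filing location:

    # Hypothetical usage of the Document class shown above; the path is illustrative.
    from pathlib import Path
    from datamule import Document

    doc = Document(type='10-K', filename=Path('filings/example-10k.htm'))
    data = doc.parse()                    # cleans the HTML, then runs txt2dict with dict_10k
    doc.write_json('example-10k.json')    # pass a name explicitly to avoid the str-only rsplit
    for section in doc:                   # sections at the highest mapped hierarchy level
        print(section['item'], len(section['text']))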
{datamule-0.429 → datamule-1.0.0}/datamule/downloader/downloader.py
@@ -10,7 +10,7 @@ import time
  from collections import deque
  
  from ..helper import identifier_to_cik, load_package_csv, fix_filing_url, headers
- from ..parser.sgml_parsing.sgml_parser_cy import parse_sgml_submission
+ from secsgml import parse_sgml_submission
  
  class RetryException(Exception):
      def __init__(self, url, retry_after=601):
@@ -122,8 +122,8 @@ class Downloader:
          raise RetryException(url)
      raise
  
- async def _get_filing_urls_from_efts(self, base_url):
-     """Fetch filing URLs from EFTS in batches."""
+ async def _get_filing_urls_from_efts(self, base_url, submission_type=None):
+     """Fetch filing URLs from EFTS in batches with form type filtering."""
      start = 0
      page_size = 100
      urls = []
@@ -152,12 +152,22 @@ class Downloader:
  if data and 'hits' in data:
      hits = data['hits']['hits']
      if hits:
+         # Filter hits based on exact form match
+         if not submission_type or submission_type == "-0":
+             filtered_hits = hits
+         else:
+             requested_forms = [submission_type] if isinstance(submission_type, str) else submission_type
+             filtered_hits = [
+                 hit for hit in hits
+                 if hit['_source'].get('form', '') in requested_forms
+             ]
+
          batch_urls = [
              f"https://www.sec.gov/Archives/edgar/data/{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}.txt"
-             for hit in hits
+             for hit in filtered_hits
          ]
          urls.extend(batch_urls)
-         pbar.update(len(hits))
+         pbar.update(len(hits)) # Update progress based on total hits processed
          self.update_progress_description()
  
      start += 10 * page_size
@@ -173,7 +183,7 @@
      pbar.close()
      self.current_pbar = None
      return urls
- 
+ 
  async def _download_file(self, url, filepath):
      """Download single file with precise rate limiting."""
      async with self.connection_semaphore:
@@ -197,7 +207,7 @@
  
  parsed_data = parse_sgml_submission(
      content=content.decode(),
-     output_dir=os.path.dirname(filepath) + f'/{url.split("/")[-1].split(".")[0].replace("-", "")}'
+     output_dir=os.path.dirname(filepath)
  )
  
  try:
@@ -306,7 +316,7 @@
  base_url = "https://efts.sec.gov/LATEST/search-index"
  efts_url = f"{base_url}?{urlencode(params, doseq=True)}"
  
- urls = await self._get_filing_urls_from_efts(efts_url)
+ urls = await self._get_filing_urls_from_efts(efts_url,submission_type)
  if urls:
      filepaths, parsed_data = await self._download_and_process(urls, output_dir)
      all_filepaths.extend(filepaths)
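
The main functional change to Downloader is that EFTS search hits are now filtered to exact form matches before URLs are built (full-text search can return related forms, such as amendments, alongside the requested type), with the requested submission_type threaded through to _get_filing_urls_from_efts. A standalone sketch of that filtering step, using made-up hit data shaped like the EFTS results handled above:

    # Standalone sketch of the exact-form filter added in this release (hit data is made up).
    def filter_hits(hits, submission_type=None):
        if not submission_type or submission_type == "-0":
            return hits
        requested_forms = [submission_type] if isinstance(submission_type, str) else submission_type
        return [hit for hit in hits if hit['_source'].get('form', '') in requested_forms]

    hits = [
        {'_id': 'acc-1:doc1.htm', '_source': {'ciks': ['0000000000'], 'form': '10-K'}},
        {'_id': 'acc-2:doc2.htm', '_source': {'ciks': ['0000000000'], 'form': '10-K/A'}},
    ]
    print([h['_source']['form'] for h in filter_hits(hits, '10-K')])  # ['10-K']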
{datamule-0.429 → datamule-1.0.0}/datamule/downloader/premiumdownloader.py
@@ -13,7 +13,7 @@ from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from queue import Queue, Empty
  from threading import Thread
- from datamule.parser.sgml_parsing.sgml_parser_cy import parse_sgml_submission
+ from secsgml import parse_sgml_submission
  import urllib.parse
  from ..helper import identifier_to_cik
  
@@ -127,12 +127,13 @@ class PremiumDownloader:
  
  def _process_file(self, item):
      filename, content = item
-     clean_name = filename[:-4] if filename.endswith('.zst') else filename
-     output_path = os.path.join(self.output_dir, Path(clean_name).stem)
      try:
-         parse_sgml_submission(None, output_dir=output_path, content=content)
+         parse_sgml_submission(output_dir=self.output_dir, content=content)
          self.pbar.update(1)
      except Exception as e:
+         accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
+         if os.path.exists(accession_dir):
+             shutil.rmtree(accession_dir)
          self.downloader._log_error(self.output_dir, filename, str(e))
  
  def _processing_worker(self):
@@ -259,7 +260,8 @@
      keepalive_timeout=60
  )
  
- async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=3600)) as session:
+ # timeout should be max 2 hours.
+ async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=7200)) as session:
      tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
      await asyncio.gather(*tasks, return_exceptions=True)
  
@@ -281,6 +283,7 @@
  total_urls.extend(more_urls)
  
  if total_urls:
+     total_urls = list(set(total_urls)) # Remove duplicates
      start_time = time.time()
      await self.process_batch(total_urls, output_dir)
      elapsed_time = time.time() - start_time
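
Two behavior changes in PremiumDownloader are worth flagging: the URL list is de-duplicated before processing, and a failed parse now removes the partially written accession directory (derived from the downloaded filename) before the error is logged, rather than leaving partial output behind. A small isolated sketch of that cleanup pattern, with hypothetical paths:

    # Hypothetical sketch of the cleanup-on-failure pattern added to _process_file.
    import os
    import shutil

    def cleanup_failed_submission(output_dir, filename):
        # Mirrors filename.split('.')[0] in the diff above: strip the extension(s)
        # to get the accession directory that the SGML parser would have created.
        accession_dir = os.path.join(output_dir, filename.split('.')[0])
        if os.path.exists(accession_dir):
            shutil.rmtree(accession_dir)

    cleanup_failed_submission('/tmp/filings', 'example-accession.sgml.zst')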