pybgpkitstream 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: pybgpkitstream
- Version: 0.1.2
+ Version: 0.1.4
  Summary: Drop-in replacement for PyBGPStream using BGPKIT
  Author: JustinLoye
  Author-email: JustinLoye <jloye@iij.ad.jp>
@@ -1,6 +1,6 @@
  [project]
  name = "pybgpkitstream"
- version = "0.1.2"
+ version = "0.1.4"
  description = "Drop-in replacement for PyBGPStream using BGPKIT"
  readme = "README.md"
  authors = [
@@ -9,6 +9,7 @@ from heapq import merge
  from operator import itemgetter
  import binascii
  import logging
+ from tempfile import TemporaryDirectory

  import aiohttp
  import bgpkit
@@ -44,24 +45,39 @@ def crc32(input_str: str):
      return f"{crc:08x}"


+ class Directory:
+     """Permanent directory that mimics TemporaryDirectory interface."""
+
+     def __init__(self, path):
+         self.name = str(path)
+
+     def cleanup(self):
+         """No-op cleanup for permanent directories."""
+         pass
+
+
  class BGPKITStream:
      def __init__(
          self,
-         ts_start: datetime.datetime,
-         ts_end: datetime.datetime,
+         ts_start: float,
+         ts_end: float,
          collector_id: str,
          data_type: list[Literal["update", "rib"]],
          cache_dir: str | None,
          filters: dict = {},
          max_concurrent_downloads: int = 10,
+         chunk_time: float | None = datetime.timedelta(hours=2).seconds,
      ):
          self.ts_start = ts_start
          self.ts_end = ts_end
          self.collector_id = collector_id
          self.data_type = data_type
-         self.cache_dir = cache_dir
+         self.cache_dir: Directory | TemporaryDirectory = (
+             Directory(cache_dir) if cache_dir else TemporaryDirectory()
+         )
          self.filters = filters
          self.max_concurrent_downloads = max_concurrent_downloads
+         self.chunk_time = chunk_time

          self.broker = bgpkit.Broker()
 
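The point of the new Directory wrapper is that both branches of the cache_dir handling expose the same two members, name and cleanup(), so the rest of the stream (prefetching, the finally block in __iter__) can treat a user-supplied cache directory and an automatic temporary one identically. A minimal, self-contained sketch of that pattern; the make_cache_dir helper and the demo values are illustrative, not part of the package:

from tempfile import TemporaryDirectory


class Directory:
    """Permanent directory that mimics TemporaryDirectory interface."""

    def __init__(self, path):
        self.name = str(path)

    def cleanup(self):
        """No-op cleanup for permanent directories."""
        pass


def make_cache_dir(cache_dir: str | None) -> Directory | TemporaryDirectory:
    # Same expression as in BGPKITStream.__init__: keep a user-supplied path,
    # otherwise fall back to a throwaway temporary directory.
    return Directory(cache_dir) if cache_dir else TemporaryDirectory()


cache = make_cache_dir(None)   # no path given -> TemporaryDirectory
print(cache.name)              # usable as a plain filesystem path either way
cache.cleanup()                # removes the tempdir; no-op for a permanent Directory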
@@ -70,7 +86,6 @@ class BGPKITStream:
          """Generate a cache filename compatible with BGPKIT parser."""

          hash_suffix = crc32(url)
-         print(url)

          if "updates." in url:
              data_type = "updates"
@@ -140,7 +155,7 @@ class BGPKITStream:
          for rc, rc_urls in self.urls[data_type].items():
              for url in rc_urls:
                  filename = self._generate_cache_filename(url)
-                 filepath = os.path.join(self.cache_dir, filename)
+                 filepath = os.path.join(self.cache_dir.name, filename)

                  if os.path.exists(filepath):
                      logging.debug(f"{filepath} is a cache hit")
@@ -171,47 +186,79 @@ class BGPKITStream:
          return ((elem.timestamp, elem, is_rib, collector) for elem in iterator)

      def __iter__(self) -> Iterator[BGPElement]:
-         self._set_urls()
+         # try/finally to cleanup the fetching cache
+         try:
+             # Manager mode: spawn smaller worker streams to balance fetch/parse
+             if self.chunk_time:
+                 current = self.ts_start
+
+                 while current < self.ts_end:
+                     chunk_end = min(current + self.chunk_time, self.ts_end)
+
+                     logging.info(
+                         f"Processing chunk: {datetime.datetime.fromtimestamp(current)} "
+                         f"to {datetime.datetime.fromtimestamp(chunk_end)}"
+                     )
+
+                     worker = type(self)(
+                         ts_start=current,
+                         ts_end=chunk_end
+                         - 1,  # remove one second because BGPKIT include border
+                         collector_id=self.collector_id,
+                         data_type=self.data_type,
+                         cache_dir=None,
+                         filters=self.filters,
+                         max_concurrent_downloads=self.max_concurrent_downloads,
+                         chunk_time=None,  # Worker doesn't chunk itself
+                     )
+
+                     yield from worker
+                     current = chunk_end + 1e-7
+
+                 return
+
+             self._set_urls()

-         if self.cache_dir:
              asyncio.run(self._prefetch_data())

-         # One iterator for each data_type * collector combinations
-         # To be merged according to the elements timestamp
-         iterators_to_merge = []
+             # One iterator for each data_type * collector combinations
+             # To be merged according to the elements timestamp
+             iterators_to_merge = []

-         for data_type in self.data_type:
-             is_rib = data_type == "rib"
-
-             # Get rib or update files per collector
-             if self.cache_dir:
-                 rc_to_urls = self.paths[data_type]
-             else:
-                 rc_to_urls = self.urls[data_type]
-
-             # Chain rib or update iterators to get one stream per collector / data_type
-             for rc, urls in rc_to_urls.items():
-                 parsers = [bgpkit.Parser(url=url, filters=self.filters) for url in urls]
-
-                 chained_iterator = chain.from_iterable(parsers)
-
-                 # Add metadata lost by bgpkit for compatibility with pubgpstream
-                 iterators_to_merge.append((chained_iterator, is_rib, rc))
-
-         # Make a generator to tag each bgpkit element with metadata
-         # Benefit 1: full compat with pybgpstream
-         # Benefit 2: we give a key easy to access for heapq to merge
-         tagged_iterators = [
-             self._create_tagged_iterator(it, is_rib, rc)
-             for it, is_rib, rc in iterators_to_merge
-         ]
-
-         # Merge and convert to pybgpstream format
-         for timestamp, bgpkit_elem, is_rib, rc in merge(
-             *tagged_iterators, key=itemgetter(0)
-         ):
-             if self.ts_start <= timestamp <= self.ts_end:
-                 yield convert_bgpkit_elem(bgpkit_elem, is_rib, rc)
+             for data_type in self.data_type:
+                 is_rib = data_type == "rib"
+
+                 # Get rib or update files per collector
+                 rc_to_paths = self.paths[data_type]
+
+                 # Chain rib or update iterators to get one stream per collector / data_type
+                 for rc, paths in rc_to_paths.items():
+                     parsers = [
+                         bgpkit.Parser(url=path, filters=self.filters) for path in paths
+                     ]
+
+                     chained_iterator = chain.from_iterable(parsers)
+
+                     # Add metadata lost by bgpkit for compatibility with pubgpstream
+                     iterators_to_merge.append((chained_iterator, is_rib, rc))
+
+             # Make a generator to tag each bgpkit element with metadata
+             # Benefit 1: full compat with pybgpstream
+             # Benefit 2: we give a key easy to access for heapq to merge
+             tagged_iterators = [
+                 self._create_tagged_iterator(it, is_rib, rc)
+                 for it, is_rib, rc in iterators_to_merge
+             ]
+
+             # Merge and convert to pybgpstream format
+             for timestamp, bgpkit_elem, is_rib, rc in merge(
+                 *tagged_iterators, key=itemgetter(0)
+             ):
+                 if self.ts_start <= timestamp <= self.ts_end:
+                     yield convert_bgpkit_elem(bgpkit_elem, is_rib, rc)
+
+         finally:
+             self.cache_dir.cleanup()

      @classmethod
      def from_config(cls, config: BGPStreamConfig):
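With chunk_time set (the default equals two hours in seconds), __iter__ now runs in manager mode: it slices the requested interval into chunks, builds a short-lived worker BGPKITStream per chunk with chunk_time=None and cache_dir=None, and yields the worker's elements before moving on, so downloading and parsing alternate instead of one long prefetch. A rough usage sketch based on the constructor shown above; the collector, time range, and import path are assumptions for illustration:

import datetime

from pybgpkitstream import BGPKITStream  # import path assumed, not shown in this diff

start = datetime.datetime(2024, 1, 1, 0, 0, tzinfo=datetime.timezone.utc).timestamp()
end = datetime.datetime(2024, 1, 1, 8, 0, tzinfo=datetime.timezone.utc).timestamp()

stream = BGPKITStream(
    ts_start=start,
    ts_end=end,
    collector_id="rrc00",     # example collector
    data_type=["update"],
    cache_dir=None,           # each worker then uses its own TemporaryDirectory
    chunk_time=2 * 3600,      # manager mode: fetch/parse in 2-hour chunks
)

for elem in stream:           # elements arrive per chunk, merged by timestamp
    print(elem)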
@@ -229,4 +276,5 @@ class BGPKITStream:
              max_concurrent_downloads=config.max_concurrent_downloads
              if config.max_concurrent_downloads
              else 10,
+             chunk_time=config.chunk_time.seconds if config.chunk_time else None,
          )
@@ -63,4 +63,10 @@ class BGPStreamConfig(BaseModel):
          description="Specifies the directory for caching downloaded files.",
      )
      filters: FilterOptions | None = Field(default=None, description="Optional filters")
-     max_concurrent_downloads: int | None = Field(default=None, description="Maximum concurrent downloads when caching")
+     max_concurrent_downloads: int | None = Field(
+         default=None, description="Maximum concurrent downloads when caching"
+     )
+     chunk_time: datetime.timedelta | None = Field(
+         default=datetime.timedelta(hours=2),
+         description="Interval for the fetch/parse cycle (avoid long prefetch time)",
+     )
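from_config hands the new chunk_time field to BGPKITStream as a number of seconds via timedelta.seconds, as shown in the earlier hunk. A small illustration of that conversion, with example values only:

import datetime

chunk = datetime.timedelta(hours=2)   # the field's default
print(chunk.seconds)                  # 7200, the value BGPKITStream receives
# Note: timedelta.seconds is only the sub-day component (timedelta(days=1).seconds == 0),
# so intervals of a day or more would not convert as expected.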