pybgpkitstream 0.1.3__tar.gz → 0.1.5__tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: pybgpkitstream
-Version: 0.1.3
+Version: 0.1.5
 Summary: Drop-in replacement for PyBGPStream using BGPKIT
 Author: JustinLoye
 Author-email: JustinLoye <jloye@iij.ad.jp>
@@ -1,6 +1,6 @@
 [project]
 name = "pybgpkitstream"
-version = "0.1.3"
+version = "0.1.5"
 description = "Drop-in replacement for PyBGPStream using BGPKIT"
 readme = "README.md"
 authors = [
@@ -9,6 +9,7 @@ from heapq import merge
 from operator import itemgetter
 import binascii
 import logging
+from tempfile import TemporaryDirectory
 
 import aiohttp
 import bgpkit
@@ -44,6 +45,17 @@ def crc32(input_str: str):
     return f"{crc:08x}"
 
 
+class Directory:
+    """Permanent directory that mimics TemporaryDirectory interface."""
+
+    def __init__(self, path):
+        self.name = str(path)
+
+    def cleanup(self):
+        """No-op cleanup for permanent directories."""
+        pass
+
+
 class BGPKITStream:
     def __init__(
         self,
@@ -60,7 +72,9 @@ class BGPKITStream:
         self.ts_end = ts_end
         self.collector_id = collector_id
         self.data_type = data_type
-        self.cache_dir = cache_dir
+        self.cache_dir: Directory | TemporaryDirectory = (
+            Directory(cache_dir) if cache_dir else TemporaryDirectory()
+        )
         self.filters = filters
         self.max_concurrent_downloads = max_concurrent_downloads
         self.chunk_time = chunk_time
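Note on the two hunks above: cache_dir is now always an object exposing .name and .cleanup(), either the new Directory shim wrapping a user-supplied path or a TemporaryDirectory created on the fly. A standalone sketch of that pattern (the path below is a placeholder, not taken from the package):

    from tempfile import TemporaryDirectory

    class Directory:
        """Permanent directory mimicking the TemporaryDirectory interface."""
        def __init__(self, path):
            self.name = str(path)

        def cleanup(self):
            pass  # never delete a user-supplied cache

    cache_dir = "/data/mrt-cache"  # placeholder user-supplied path
    cache = Directory(cache_dir) if cache_dir else TemporaryDirectory()
    print(cache.name)  # either variant can be joined with filenames via os.path.join
    cache.cleanup()    # safe to call unconditionally, e.g. in a finally block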
@@ -72,7 +86,6 @@ class BGPKITStream:
         """Generate a cache filename compatible with BGPKIT parser."""
 
         hash_suffix = crc32(url)
-        print(url)
 
         if "updates." in url:
             data_type = "updates"
@@ -103,9 +116,8 @@ class BGPKITStream:
         self.urls = {"rib": defaultdict(list), "update": defaultdict(list)}
         for data_type in self.data_type:
             items: list[BrokerItem] = self.broker.query(
-                ts_start=datetime.datetime.fromtimestamp(self.ts_start)
-                - datetime.timedelta(minutes=1),
-                ts_end=datetime.datetime.fromtimestamp(self.ts_end),
+                ts_start=int(self.ts_start - 60),
+                ts_end=int(self.ts_end),
                 collector_id=self.collector_id,
                 data_type=data_type,
             )
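The broker query now passes plain Unix timestamps (padded by 60 s at the start) instead of naive datetime objects built with fromtimestamp(), which removes the dependence on the local timezone. A hedged standalone sketch of the equivalent call, assuming the default bgpkit.Broker() constructor and using placeholder values for the window and collector:

    import bgpkit

    ts_start, ts_end = 1735689600, 1735693200  # placeholder epoch seconds
    broker = bgpkit.Broker()
    items = broker.query(
        ts_start=int(ts_start - 60),  # pad one minute before the window start
        ts_end=int(ts_end),
        collector_id="rrc00",         # placeholder collector
        data_type="rib",
    )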
@@ -142,7 +154,7 @@ class BGPKITStream:
        for rc, rc_urls in self.urls[data_type].items():
            for url in rc_urls:
                filename = self._generate_cache_filename(url)
-               filepath = os.path.join(self.cache_dir, filename)
+               filepath = os.path.join(self.cache_dir.name, filename)
 
                if os.path.exists(filepath):
                    logging.debug(f"{filepath} is a cache hit")
@@ -173,76 +185,79 @@ class BGPKITStream:
         return ((elem.timestamp, elem, is_rib, collector) for elem in iterator)
 
     def __iter__(self) -> Iterator[BGPElement]:
-        # Manager mode: spawn smaller worker streams to balance fetch/parse
-        if self.chunk_time:
-            current = self.ts_start
-
-            while current < self.ts_end:
-                chunk_end = min(current + self.chunk_time, self.ts_end)
-
-                logging.info(
-                    f"Processing chunk: {datetime.datetime.fromtimestamp(current)} "
-                    f"to {datetime.datetime.fromtimestamp(chunk_end)}"
-                )
-
-                worker = type(self)(
-                    ts_start=current,
-                    ts_end=chunk_end
-                    - 1,  # remove one second because BGPKIT include border
-                    collector_id=self.collector_id,
-                    data_type=self.data_type,
-                    cache_dir=self.cache_dir,
-                    filters=self.filters,
-                    max_concurrent_downloads=self.max_concurrent_downloads,
-                    chunk_time=None,  # Worker doesn't chunk itself
-                )
-
-                yield from worker
-                current = chunk_end + 1e-7
+        # try/finally to cleanup the fetching cache
+        try:
+            # Manager mode: spawn smaller worker streams to balance fetch/parse
+            if self.chunk_time:
+                current = self.ts_start
+
+                while current < self.ts_end:
+                    chunk_end = min(current + self.chunk_time, self.ts_end)
+
+                    logging.info(
+                        f"Processing chunk: {datetime.datetime.fromtimestamp(current)} "
+                        f"to {datetime.datetime.fromtimestamp(chunk_end)}"
+                    )
+
+                    worker = type(self)(
+                        ts_start=current,
+                        ts_end=chunk_end
+                        - 1,  # remove one second because BGPKIT include border
+                        collector_id=self.collector_id,
+                        data_type=self.data_type,
+                        cache_dir=None,
+                        filters=self.filters,
+                        max_concurrent_downloads=self.max_concurrent_downloads,
+                        chunk_time=None,  # Worker doesn't chunk itself
+                    )
+
+                    yield from worker
+                    current = chunk_end + 1e-7
+
+                return
+
+            self._set_urls()
 
-            return
-
-        self._set_urls()
-
-        if self.cache_dir:
             asyncio.run(self._prefetch_data())
 
-        # One iterator for each data_type * collector combinations
-        # To be merged according to the elements timestamp
-        iterators_to_merge = []
+            # One iterator for each data_type * collector combinations
+            # To be merged according to the elements timestamp
+            iterators_to_merge = []
 
-        for data_type in self.data_type:
-            is_rib = data_type == "rib"
-
-            # Get rib or update files per collector
-            if self.cache_dir:
-                rc_to_urls = self.paths[data_type]
-            else:
-                rc_to_urls = self.urls[data_type]
-
-            # Chain rib or update iterators to get one stream per collector / data_type
-            for rc, urls in rc_to_urls.items():
-                parsers = [bgpkit.Parser(url=url, filters=self.filters) for url in urls]
-
-                chained_iterator = chain.from_iterable(parsers)
-
-                # Add metadata lost by bgpkit for compatibility with pubgpstream
-                iterators_to_merge.append((chained_iterator, is_rib, rc))
-
-        # Make a generator to tag each bgpkit element with metadata
-        # Benefit 1: full compat with pybgpstream
-        # Benefit 2: we give a key easy to access for heapq to merge
-        tagged_iterators = [
-            self._create_tagged_iterator(it, is_rib, rc)
-            for it, is_rib, rc in iterators_to_merge
-        ]
-
-        # Merge and convert to pybgpstream format
-        for timestamp, bgpkit_elem, is_rib, rc in merge(
-            *tagged_iterators, key=itemgetter(0)
-        ):
-            if self.ts_start <= timestamp <= self.ts_end:
-                yield convert_bgpkit_elem(bgpkit_elem, is_rib, rc)
+            for data_type in self.data_type:
+                is_rib = data_type == "rib"
+
+                # Get rib or update files per collector
+                rc_to_paths = self.paths[data_type]
+
+                # Chain rib or update iterators to get one stream per collector / data_type
+                for rc, paths in rc_to_paths.items():
+                    parsers = [
+                        bgpkit.Parser(url=path, filters=self.filters) for path in paths
+                    ]
+
+                    chained_iterator = chain.from_iterable(parsers)
+
+                    # Add metadata lost by bgpkit for compatibility with pubgpstream
+                    iterators_to_merge.append((chained_iterator, is_rib, rc))
+
+            # Make a generator to tag each bgpkit element with metadata
+            # Benefit 1: full compat with pybgpstream
+            # Benefit 2: we give a key easy to access for heapq to merge
+            tagged_iterators = [
+                self._create_tagged_iterator(it, is_rib, rc)
+                for it, is_rib, rc in iterators_to_merge
+            ]
+
+            # Merge and convert to pybgpstream format
+            for timestamp, bgpkit_elem, is_rib, rc in merge(
+                *tagged_iterators, key=itemgetter(0)
+            ):
+                if self.ts_start <= timestamp <= self.ts_end:
+                    yield convert_bgpkit_elem(bgpkit_elem, is_rib, rc)
+
+        finally:
+            self.cache_dir.cleanup()
 
     @classmethod
     def from_config(cls, config: BGPStreamConfig):
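Taken together, __iter__ now wraps the whole stream in try/finally so the download cache (temporary unless a cache_dir was supplied) is cleaned up even if the consumer stops early, and manager-mode workers are created with cache_dir=None so each chunk gets its own temporary directory. An illustrative way to consume the stream; every argument value below is a placeholder, not something taken from this diff:

    stream = BGPKITStream(
        ts_start=1735689600,            # placeholder epoch seconds
        ts_end=1735696800,
        collector_id="rrc00",           # placeholder collector
        data_type=["update"],
        cache_dir=None,                 # None -> per-run TemporaryDirectory
        filters={"peer_asn": "3333"},   # placeholder BGPKIT filter
        max_concurrent_downloads=4,
        chunk_time=3600,                # manager mode: one-hour worker chunks
    )

    n_elems = sum(1 for _ in stream)    # cache is removed once iteration finishes
    print(n_elems)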
@@ -1,5 +1,5 @@
 import datetime
-from pydantic import BaseModel, Field, DirectoryPath
+from pydantic import BaseModel, Field, DirectoryPath, field_validator
 from typing import Literal
 from ipaddress import IPv4Address, IPv6Address
 
@@ -70,3 +70,13 @@ class BGPStreamConfig(BaseModel):
         default=datetime.timedelta(hours=2),
         description="Interval for the fetch/parse cycle (avoid long prefetch time)",
     )
+
+    @field_validator("start_time", "end_time")
+    @classmethod
+    def normalize_to_utc(cls, dt: datetime.datetime) -> datetime.datetime:
+        # if naive datetime (not timezone-aware) assume it's UTC
+        if dt.tzinfo is None:
+            return dt.replace(tzinfo=datetime.timezone.utc)
+        # if timezone-aware, convert to utc
+        else:
+            return dt.astimezone(datetime.timezone.utc)
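The new validator pins start_time and end_time to UTC: a naive datetime is assumed to already be UTC, while an aware one is converted. The effect, shown standalone with placeholder datetimes rather than through the config model:

    import datetime

    naive = datetime.datetime(2025, 1, 1, 0, 0)          # no tzinfo: assumed UTC
    aware = naive.replace(tzinfo=datetime.timezone(datetime.timedelta(hours=9)))

    print(naive.replace(tzinfo=datetime.timezone.utc))   # 2025-01-01 00:00:00+00:00
    print(aware.astimezone(datetime.timezone.utc))       # 2024-12-31 15:00:00+00:00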