pybgpkitstream 0.1.6__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- pybgpkitstream/__init__.py
+++ pybgpkitstream/__init__.py
@@ -1,4 +1,4 @@
- from .bgpstreamconfig import BGPStreamConfig, FilterOptions
+ from .bgpstreamconfig import BGPStreamConfig, FilterOptions, PyBGPKITStreamConfig
  from .bgpkitstream import BGPKITStream
 
- __all__ = ["BGPStreamConfig", "FilterOptions", "BGPKITStream"]
+ __all__ = ["BGPStreamConfig", "FilterOptions", "BGPKITStream", "PyBGPKITStreamConfig"]
--- pybgpkitstream/bgpkitstream.py
+++ pybgpkitstream/bgpkitstream.py
@@ -6,7 +6,7 @@ from typing import Iterator, Literal
  from collections import defaultdict
  from itertools import chain
  from heapq import merge
- from operator import itemgetter
+ from operator import attrgetter, itemgetter
  import binascii
  import logging
  from tempfile import TemporaryDirectory
@@ -15,8 +15,27 @@ import aiohttp
  import bgpkit
  from bgpkit.bgpkit_broker import BrokerItem
 
- from pybgpkitstream.bgpstreamconfig import BGPStreamConfig
+ from pybgpkitstream.bgpstreamconfig import (
+     BGPStreamConfig,
+     FilterOptions,
+     PyBGPKITStreamConfig,
+ )
  from pybgpkitstream.bgpelement import BGPElement
+ from pybgpkitstream.bgpparser import (
+     BGPParser,
+     PyBGPKITParser,
+     BGPKITParser,
+     PyBGPStreamParser,
+     BGPdumpParser,
+ )
+ from pybgpkitstream.utils import dt_from_filepath
+
+ name2parser = {
+     "pybgpkit": PyBGPKITParser,
+     "bgpkit": BGPKITParser,
+     "pybgpstream": PyBGPStreamParser,
+     "bgpdump": BGPdumpParser,
+ }
 
 
  logger = logging.getLogger(__name__)
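For orientation, the new `name2parser` registry is what lets the stream swap MRT parser backends by name. A minimal sketch of a lookup driving a parse (the file path and collector are illustrative):

    from pybgpkitstream.bgpparser import PyBGPKITParser, BGPKITParser

    name2parser = {"pybgpkit": PyBGPKITParser, "bgpkit": BGPKITParser}

    parser_cls = name2parser["pybgpkit"]  # resolve the class once
    # Parsers take (filepath, is_rib, collector) and iterate lazily
    parser = parser_cls("updates.20240101.0000.gz", False, "rrc00")
    for elem in parser:
        print(elem.time, elem.fields["prefix"])
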
@@ -72,25 +91,40 @@ class BGPKITStream:
          ts_end: float,
          collector_id: str,
          data_type: list[Literal["update", "rib"]],
-         cache_dir: str | None,
-         filters: dict = {},
-         max_concurrent_downloads: int = 10,
+         filters: FilterOptions | None,
+         cache_dir: str | None = None,
+         max_concurrent_downloads: int | None = 10,
          chunk_time: float | None = datetime.timedelta(hours=2).seconds,
+         ram_fetch: bool | None = True,
+         parser_name: str | None = "pybgpkit",
      ):
+         # Stream config
          self.ts_start = ts_start
          self.ts_end = ts_end
          self.collector_id = collector_id
          self.data_type = data_type
-         self.cache_dir: Directory | TemporaryDirectory = (
-             Directory(cache_dir)
-             if cache_dir
-             else TemporaryDirectory(dir=get_shared_memory())
-         )
+         if not filters:
+             filters = FilterOptions()
          self.filters = filters
+
+         # Implementation config
          self.max_concurrent_downloads = max_concurrent_downloads
          self.chunk_time = chunk_time
+         self.ram_fetch = ram_fetch
+         if cache_dir:
+             self.cache_dir = Directory(cache_dir)
+         else:
+             if ram_fetch:
+                 self.cache_dir = TemporaryDirectory(dir=get_shared_memory())
+             else:
+                 self.cache_dir = TemporaryDirectory()
+         if not parser_name:
+             self.parser_name = "pybgpkit"
+         else:
+             self.parser_name = parser_name
 
          self.broker = bgpkit.Broker()
+         # Look up via the normalized name so an explicit parser_name=None still works
+         self.parser_cls: type[BGPParser] = name2parser[self.parser_name]
 
      @staticmethod
      def _generate_cache_filename(url):
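Assuming the constructor signature above, a stream can now be built directly; the collector and time window here are illustrative:

    import datetime
    from pybgpkitstream import BGPKITStream, FilterOptions

    start = datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)
    stream = BGPKITStream(
        ts_start=start.timestamp(),
        ts_end=(start + datetime.timedelta(minutes=30)).timestamp(),
        collector_id="rrc00",
        data_type=["update"],
        filters=FilterOptions(ip_version=4),
        ram_fetch=True,  # temp files go to a /dev/shm-backed directory
        parser_name="pybgpkit",  # resolved through name2parser
    )
    for elem in stream:
        print(elem.time, elem.type, elem.fields["prefix"])
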
@@ -191,11 +225,14 @@ class BGPKITStream:
              self.paths[data_type][rc].append(filepath)
          logging.info("All downloads finished.")
 
-     def _create_tagged_iterator(self, iterator, is_rib, collector):
-         """Creates a generator that tags elements with metadata missing in bgpkit."""
-         return ((elem.timestamp, elem, is_rib, collector) for elem in iterator)
+     def __iter__(self):
+         if "update" in self.data_type:
+             return self._iter_update()
+         else:
+             return self._iter_rib()
 
-     def __iter__(self) -> Iterator[BGPElement]:
+     def _iter_update(self) -> Iterator[BGPElement]:
+         # __iter__ for data types [ribs, updates] or [updates]
          # try/finally to clean up the fetching cache
          try:
              # Manager mode: spawn smaller worker streams to balance fetch/parse
@@ -209,17 +246,20 @@ class BGPKITStream:
                      f"Processing chunk: {datetime.datetime.fromtimestamp(current)} "
                      f"to {datetime.datetime.fromtimestamp(chunk_end)}"
                  )
-
                  worker = type(self)(
                      ts_start=current,
                      ts_end=chunk_end
                      - 1,  # remove one second because BGPKIT includes the boundary
                      collector_id=self.collector_id,
                      data_type=self.data_type,
-                     cache_dir=None,
+                     cache_dir=self.cache_dir.name
+                     if isinstance(self.cache_dir, Directory)
+                     else None,
                      filters=self.filters,
                      max_concurrent_downloads=self.max_concurrent_downloads,
                      chunk_time=None,  # Worker doesn't chunk itself
+                     ram_fetch=self.ram_fetch,
+                     parser_name=self.parser_name,
                  )
 
                  yield from worker
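The manager/worker pattern above simply partitions [ts_start, ts_end] into chunk_time windows, trimming one second off each window end because BGPKIT treats the boundary as inclusive. A standalone sketch of the window arithmetic (chunk_windows is a hypothetical helper, not part of the package):

    def chunk_windows(ts_start: float, ts_end: float, chunk_time: float):
        # Mirrors the manager loop: each worker covers [current, chunk_end - 1]
        current = ts_start
        while current < ts_end:
            chunk_end = min(current + chunk_time, ts_end)
            yield current, chunk_end - 1  # -1 s: BGPKIT includes the boundary
            current = chunk_end + 1e-7  # nudge past the boundary

    windows = list(chunk_windows(0.0, 5 * 3600.0, 2 * 3600.0))
    print(len(windows))  # 3 windows for five hours in 2-hour chunks
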
@@ -228,7 +268,6 @@ class BGPKITStream:
                  return
 
          self._set_urls()
-
          asyncio.run(self._prefetch_data())
 
          # One iterator for each data_type * collector combination
@@ -243,48 +282,115 @@ class BGPKITStream:
 
          # Chain rib or update iterators to get one stream per collector / data_type
          for rc, paths in rc_to_paths.items():
+             # Don't use a generator here; parsers are lazy anyway
              parsers = [
-                 bgpkit.Parser(url=path, filters=self.filters) for path in paths
+                 self.parser_cls(path, is_rib, rc, filters=self.filters)
+                 for path in paths
              ]
 
              chained_iterator = chain.from_iterable(parsers)
 
              # Add metadata lost by bgpkit for compatibility with pybgpstream
-             iterators_to_merge.append((chained_iterator, is_rib, rc))
-
-             # Make a generator to tag each bgpkit element with metadata
-             # Benefit 1: full compat with pybgpstream
-             # Benefit 2: we give a key easy to access for heapq to merge
-         tagged_iterators = [
-             self._create_tagged_iterator(it, is_rib, rc)
-             for it, is_rib, rc in iterators_to_merge
-         ]
-
-         # Merge and convert to pybgpstream format
-         for timestamp, bgpkit_elem, is_rib, rc in merge(
-             *tagged_iterators, key=itemgetter(0)
-         ):
-             if self.ts_start <= timestamp <= self.ts_end:
-                 yield convert_bgpkit_elem(bgpkit_elem, is_rib, rc)
+             # iterators_to_merge.append((chained_iterator, is_rib, rc))
+             iterators_to_merge.append(chained_iterator)
 
+         for bgpelem in merge(*iterators_to_merge, key=attrgetter("time")):
+             if self.ts_start <= bgpelem.time <= self.ts_end:
+                 yield bgpelem
+     finally:
+         self.cache_dir.cleanup()
+
+     def _iter_rib(self) -> Iterator[BGPElement]:
+         # __iter__ for data types [ribs]
+         # try/finally to clean up the fetching cache
+         try:
+             # Manager mode: spawn smaller worker streams to balance fetch/parse
+             if self.chunk_time:
+                 current = self.ts_start
+
+                 while current < self.ts_end:
+                     chunk_end = min(current + self.chunk_time, self.ts_end)
+
+                     logging.info(
+                         f"Processing chunk: {datetime.datetime.fromtimestamp(current)} "
+                         f"to {datetime.datetime.fromtimestamp(chunk_end)}"
+                     )
+                     worker = type(self)(
+                         ts_start=current,
+                         ts_end=chunk_end
+                         - 1,  # remove one second because BGPKIT includes the boundary
+                         collector_id=self.collector_id,
+                         data_type=self.data_type,
+                         cache_dir=self.cache_dir.name
+                         if isinstance(self.cache_dir, Directory)
+                         else None,
+                         filters=self.filters,
+                         max_concurrent_downloads=self.max_concurrent_downloads,
+                         chunk_time=None,  # Worker doesn't chunk itself
+                         ram_fetch=self.ram_fetch,
+                         parser_name=self.parser_name,
+                     )
+
+                     yield from worker
+                     current = chunk_end + 1e-7
+
+                 return
+
+             self._set_urls()
+             asyncio.run(self._prefetch_data())
+
+             rc_to_paths = self.paths["rib"]
+
+             # Aggregate all RIB parsers for ordering
+             iterators_to_order = []
+             for rc, paths in rc_to_paths.items():
+                 # Don't use a generator here; parsers are lazy anyway
+                 parsers = [
+                     (
+                         dt_from_filepath(path),
+                         rc,
+                         self.parser_cls(path, True, rc, filters=self.filters),
+                     )
+                     for path in paths
+                 ]
+                 iterators_to_order.extend(parsers)
+
+             iterators_to_order.sort(key=itemgetter(0, 1))
+
+             for bgpelem in chain.from_iterable(
+                 (iterator[2] for iterator in iterators_to_order)
+             ):
+                 if self.ts_start <= bgpelem.time <= self.ts_end:
+                     yield bgpelem
          finally:
              self.cache_dir.cleanup()
 
      @classmethod
-     def from_config(cls, config: BGPStreamConfig):
-         return cls(
-             ts_start=config.start_time.timestamp(),
-             ts_end=config.end_time.timestamp(),
-             collector_id=",".join(config.collectors),
-             data_type=[
-                 dtype[:-1] for dtype in config.data_types
-             ],  # removes plural form
-             cache_dir=str(config.cache_dir) if config.cache_dir else None,
-             filters=config.filters.model_dump(exclude_unset=True)
-             if config.filters
-             else {},
-             max_concurrent_downloads=config.max_concurrent_downloads
-             if config.max_concurrent_downloads
-             else 10,
-             chunk_time=config.chunk_time.seconds if config.chunk_time else None,
-         )
+     def from_config(cls, config: PyBGPKITStreamConfig | BGPStreamConfig):
+         if isinstance(config, PyBGPKITStreamConfig):
+             stream_config = config.bgpstream_config
+             return cls(
+                 ts_start=stream_config.start_time.timestamp(),
+                 ts_end=stream_config.end_time.timestamp(),
+                 collector_id=",".join(stream_config.collectors),
+                 data_type=[dtype[:-1] for dtype in stream_config.data_types],
+                 filters=stream_config.filters
+                 if stream_config.filters
+                 else FilterOptions(),
+                 cache_dir=str(config.cache_dir) if config.cache_dir else None,
+                 max_concurrent_downloads=config.max_concurrent_downloads
+                 if config.max_concurrent_downloads
+                 else 10,
+                 chunk_time=config.chunk_time.seconds if config.chunk_time else None,
+                 ram_fetch=config.ram_fetch if config.ram_fetch else None,
+                 parser_name=config.parser if config.parser else "pybgpkit",
+             )
+
+         elif isinstance(config, BGPStreamConfig):
+             return cls(
+                 ts_start=config.start_time.timestamp(),
+                 ts_end=config.end_time.timestamp(),
+                 collector_id=",".join(config.collectors),
+                 data_type=[dtype[:-1] for dtype in config.data_types],
+                 filters=config.filters if config.filters else FilterOptions(),
+             )
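The switch to merge(*iterators_to_merge, key=attrgetter("time")) works because heapq.merge only requires each input to be time-sorted on its own, which MRT dumps are in practice; the old tuple-tagging wrapper becomes unnecessary once parsers emit BGPElement objects carrying a `time` attribute. A self-contained illustration:

    from dataclasses import dataclass
    from heapq import merge
    from operator import attrgetter

    @dataclass
    class Elem:
        time: float

    a = [Elem(1.0), Elem(3.0)]  # each input already sorted by time
    b = [Elem(2.0), Elem(4.0)]
    print([e.time for e in merge(a, b, key=attrgetter("time"))])
    # [1.0, 2.0, 3.0, 4.0]
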
--- /dev/null
+++ pybgpkitstream/bgpparser.py
@@ -0,0 +1,430 @@
+ import bgpkit
+ from pybgpkitstream.bgpstreamconfig import FilterOptions
+ from pybgpkitstream.bgpelement import BGPElement
+ from typing import Iterator, Protocol
+ import re
+ import ipaddress
+ import subprocess as sp
+ from pybgpkitstream.utils import dt_from_filepath
+
+ try:
+     import pybgpstream
+ except ImportError:
+     pass
+
+
+ class BGPParser(Protocol):
+     filepath: str
+     is_rib: bool
+     collector: str
+     filters: FilterOptions
+
+     def __iter__(self) -> Iterator[BGPElement]: ...
+
+
+ class PyBGPKITParser(BGPParser):
+     """Use the BGPKIT Python bindings (default parser). Slower than the alternatives but easier to ship (no system dependencies)."""
+
+     def __init__(
+         self,
+         filepath: str,
+         is_rib: bool,
+         collector: str,
+         filters: FilterOptions = FilterOptions(),
+     ):
+         self.filepath = filepath
+         self.parser = None  # placeholder for lazy instantiation
+         self.is_rib = is_rib
+         self.collector = collector
+         self.filters = filters.model_dump(exclude_unset=True)
+         # cast the int IP version to the pybgpkit "ipv4" or "ipv6" string
+         if "ip_version" in self.filters:
+             ipv_int = self.filters["ip_version"]
+             if ipv_int:
+                 self.filters["ip_version"] = f"ipv{ipv_int}"
+
+     def _convert(self, element) -> BGPElement:
+         return BGPElement(
+             type="R" if self.is_rib else element.elem_type,
+             collector=self.collector,
+             time=element.timestamp,
+             peer_asn=element.peer_asn,
+             peer_address=element.peer_ip,
+             fields={
+                 "next-hop": element.next_hop,
+                 "as-path": element.as_path,
+                 "communities": [] if not element.communities else element.communities,
+                 "prefix": element.prefix,
+             },
+         )
+
+     def __iter__(self) -> Iterator[BGPElement]:
+         parser = bgpkit.Parser(self.filepath, filters=self.filters)
+         for elem in parser:
+             yield self._convert(elem)
+
+
+ class BGPKITParser(BGPParser):
+     """Run BGPKIT's CLI `bgpkit-parser` as a subprocess."""
+
+     def __init__(
+         self,
+         filepath: str,
+         is_rib: bool,
+         collector: str,
+         filters: FilterOptions | str | None = None,
+     ):
+         self.filepath = filepath
+         self.parser = None  # placeholder for lazy instantiation
+         self.is_rib = is_rib
+         self.collector = collector
+         self.filters = filters
+
+         # Set the timestamp for the same behavior as the bgpdump default (timestamps match the RIB time, not the last change)
+         self.time = int(dt_from_filepath(self.filepath).timestamp())
+
+     def __iter__(self):
+         cmd = build_bgpkit_cmd(self.filepath, self.filters)
+         self.parser = sp.Popen(cmd, stdout=sp.PIPE, text=True, bufsize=1)
+
+         stream = (self._convert(line) for line in self.parser.stdout)
+
+         try:
+             yield from stream
+         finally:
+             # Cleanup happens whether exhausted or abandoned
+             self.parser.stdout.close()
+             self.parser.terminate()
+             self.parser.wait()  # Reap the zombie process
+
+     def _convert(self, element: str):
+         element = element.rstrip().split("|")
+         rec_type = element[0]
+
+         # 1. Handle Withdrawals (W)
+         # Structure: Type|Time|PeerIP|PeerAS|Prefix
+         if rec_type == "W":
+             return BGPElement(
+                 type="W",
+                 collector=self.collector,
+                 time=self.time,  # force RIB filename timestamp instead of last changed
+                 peer_asn=int(element[3]),
+                 peer_address=element[2],
+                 fields={"prefix": element[4]},
+             )
+
+         # 2. Handle Announcements (A)
+         # Structure: Type|Time|PeerIP|PeerAS|Prefix|ASPath|Origin|NextHop|...|Communities|...
+         # bgpkit-parser index mapping:
+         # 0: Type, 1: Time, 2: PeerIP, 3: PeerAS, 4: Prefix,
+         # 5: ASPath, 7: NextHop, 10: Communities
+
+         rec_comm = element[10]
+
+         return BGPElement(
+             # bgpkit-parser outputs 'A' for both updates and RIB entries; tag RIB entries as 'R'
+             "R" if self.is_rib else rec_type,
+             self.collector,
+             # float(element[1]),
+             self.time,
+             int(element[3]),
+             element[2],
+             {
+                 "prefix": element[4],
+                 "as-path": element[5],
+                 "next-hop": element[7],
+                 # Fast check for empty communities
+                 "communities": rec_comm.split() if rec_comm else [],
+             },
+         )
+
+
+ class PyBGPStreamParser(BGPParser):
+     """Use pybgpstream as an MRT parser with the `singlefile` data interface."""
+
+     def __init__(
+         self,
+         filepath: str,
+         is_rib: bool,
+         collector: str,
+         filters: FilterOptions,
+         *args,
+         **kwargs,
+     ):
+         self.filepath = filepath
+         self.is_rib = is_rib
+         self.collector = collector
+         self.filters = generate_bgpstream_filters(filters) if filters else None
+
+     def __iter__(self):
+         stream = pybgpstream.BGPStream(data_interface="singlefile", filter=self.filters)
+         # singlefile exposes separate options for RIB and update files
+         option = "rib-file" if self.is_rib else "upd-file"
+         stream.set_data_interface_option("singlefile", option, self.filepath)
+
+         for elem in stream:
+             elem.collector = self.collector
+             yield elem
+
+
169
+ """Run bgpdump as a subprocess. I might have over-engineered the filtering."""
170
+
171
+ def __init__(self, filepath, is_rib, collector, filters):
172
+ self.filepath = filepath
173
+ self.collector = collector
174
+
175
+ self._init_filters(filters)
176
+
177
+ def __iter__(self):
178
+ self.parser = sp.Popen(
179
+ ["bgpdump", "-m", "-v", self.filepath], stdout=sp.PIPE, text=True, bufsize=1
180
+ )
181
+
182
+ try:
183
+ raw_stream = (self._convert(line) for line in self.parser.stdout)
184
+ # Filter STATE message
185
+ clean_stream = (e for e in raw_stream if e is not None)
186
+
187
+ if self._filter_func:
188
+ yield from filter(self._filter_func, clean_stream)
189
+ else:
190
+ yield from clean_stream
191
+ finally:
192
+ # Cleanup happens whether exhausted or abandoned
193
+ self.parser.stdout.close()
194
+ self.parser.terminate()
195
+ self.parser.wait() # Reap the zombie process
196
+
197
+ def _convert(self, element: str):
198
+ # Extract type once to avoid repeated list lookups
199
+ element = element.rstrip().split("|")
200
+ elem_type = element[2]
201
+ if elem_type == "STATE":
202
+ return
203
+
204
+ # 1. Handle Withdrawals (Fastest path, fewer fields)
205
+ if elem_type == "W":
206
+ return BGPElement(
207
+ "W",
208
+ self.collector,
209
+ float(element[1]),
210
+ int(element[4]),
211
+ element[3],
212
+ {"prefix": element[5]}, # Dict literal is faster than assignment
213
+ )
214
+
215
+ # 2. Handle RIB (TABLE_DUMP2) and Announcements (A)
216
+ # Common vars
217
+ rec_comm = element[11]
218
+
219
+ # Logic: if TABLE_DUMP2, type is R, else A
220
+ # Construct fields dict in one shot (BUILD_MAP opcode)
221
+ return BGPElement(
222
+ "R" if elem_type == "B" else "A",
223
+ self.collector,
224
+ float(element[1]),
225
+ int(element[4]),
226
+ element[3],
227
+ {
228
+ "prefix": element[5],
229
+ "as-path": element[6],
230
+ "next-hop": element[8],
231
+ # Check for empty string before splitting (avoids creating [''])
232
+ "communities": rec_comm.split() if rec_comm else [],
233
+ },
234
+ )
235
+
236
+ def _init_filters(self, f: FilterOptions):
237
+ # 1. Pre-process sets for O(1) lookups and compile Regex
238
+ # self.peer_asns = set([f.peer_asn]) if f.peer_asn else (set(f.peer_ips) if f.peer_ips else None)
239
+ if not f.model_dump(exclude_unset=True):
240
+ self._filter_func = None
241
+
242
+ self.peer_asn = f.peer_asn
243
+
244
+ # Peer IPs (handles both single and list)
245
+ self.peer_ips = None
246
+ if f.peer_ip:
247
+ self.peer_ips = {str(f.peer_ip)}
248
+ elif f.peer_ips:
249
+ self.peer_ips = {str(ip) for ip in f.peer_ips}
250
+
251
+ self.origin_asn = str(f.origin_asn) if f.origin_asn else None
252
+ self.update_type = (
253
+ f.update_type[0].upper() if f.update_type else None
254
+ ) # 'A' or 'W'
255
+ self.ip_version = f.ip_version
256
+
257
+ # Regex and CIDR objects
258
+ self.as_path_re = re.compile(f.as_path) if f.as_path else None
259
+ self.exact_net = ipaddress.ip_network(f.prefix) if f.prefix else None
260
+ self.sub_net = ipaddress.ip_network(f.prefix_sub) if f.prefix_sub else None
261
+ self.super_net = (
262
+ ipaddress.ip_network(f.prefix_super) if f.prefix_super else None
263
+ )
264
+ self.ss_net = (
265
+ ipaddress.ip_network(f.prefix_super_sub) if f.prefix_super_sub else None
266
+ )
267
+
268
+ # 2. Build the optimized filter function
269
+ self._filter_func = self._compile_filter()
270
+
271
+ def _compile_filter(self):
272
+ # Localize variables to the closure to avoid 'self' lookups in the loop
273
+ p_asn = self.peer_asn
274
+ p_ips = self.peer_ips
275
+ o_asn = self.origin_asn
276
+ u_type = self.update_type
277
+ version = self.ip_version
278
+ path_re = self.as_path_re
279
+
280
+ e_net = self.exact_net
281
+ sub_n = self.sub_net
282
+ sup_n = self.super_net
283
+ ss_n = self.ss_net
284
+
285
+ def filter_logic(e: BGPElement) -> bool:
286
+ # 1. Cheap checks first (Integers and Strings)
287
+ if p_asn is not None and int(e.peer_asn) != p_asn:
288
+ return False
289
+ if p_ips is not None and e.peer_address not in p_ips:
290
+ return False
291
+ if u_type is not None and e.type != u_type:
292
+ return False
293
+
294
+ # 2. String processing (Origin ASN and AS Path)
295
+ # Use .get() or direct access depending on your confidence in 'fields' content
296
+ as_path = e.fields.get("as-path", "")
297
+ if o_asn is not None:
298
+ if not as_path or as_path.rsplit(" ", 1)[-1] != o_asn:
299
+ return False
300
+ if path_re is not None and not path_re.search(as_path):
301
+ return False
302
+
303
+ # 3. CIDR / IP Logic (Expensive)
304
+ prefix_str = e.fields.get("prefix")
305
+ if version is not None:
306
+ # Fast check for IP version without parsing
307
+ is_v6 = ":" in prefix_str if prefix_str else False
308
+ if (version == 6 and not is_v6) or (version == 4 and is_v6):
309
+ return False
310
+
311
+ if e_net or sub_n or sup_n or ss_n:
312
+ if not prefix_str:
313
+ return False
314
+ net = ipaddress.ip_network(prefix_str)
315
+ if e_net and net != e_net:
316
+ return False
317
+ if sub_n and not net.subnet_of(sub_n):
318
+ return False
319
+ if sup_n and not net.supernet_of(sup_n):
320
+ return False
321
+ if ss_n and not (net.subnet_of(ss_n) or net.supernet_of(ss_n)):
322
+ return False
323
+
324
+ return True
325
+
326
+ return filter_logic
327
+
328
+
329
+ def generate_bgpstream_filters(f: FilterOptions) -> str | None:
330
+ """Generates a filter string compatible with BGPStream's C parser from a BGPStreamConfig object."""
331
+ if not f:
332
+ return None
333
+ if not f.model_dump(exclude_unset=True):
334
+ return None
335
+
336
+ parts = []
337
+
338
+ if f.peer_asn:
339
+ parts.append(f"peer {f.peer_asn}")
340
+
341
+ if f.as_path:
342
+ # Quote the value to handle potential spaces in the regex
343
+ parts.append(f'aspath "{f.as_path}"')
344
+
345
+ if f.origin_asn:
346
+ # Filtering by origin ASN is typically done via an AS path regex
347
+ parts.append(f'aspath "_{f.origin_asn}$"')
348
+
349
+ if f.update_type:
350
+ # The parser expects 'announcements' or 'withdrawals'
351
+ value = "announcements" if f.update_type == "announce" else "withdrawals"
352
+ parts.append(f"elemtype {value}")
353
+
354
+ # Handle all prefix variations
355
+ if f.prefix:
356
+ parts.append(f"prefix exact {f.prefix}")
357
+ if f.prefix_super:
358
+ parts.append(f"prefix less {f.prefix_super}")
359
+ if f.prefix_sub:
360
+ parts.append(f"prefix more {f.prefix_sub}")
361
+ if f.prefix_super_sub:
362
+ parts.append(f"prefix any {f.prefix_super_sub}")
363
+
364
+ if f.ip_version:
365
+ parts.append(f"ipversion {f.ip_version[-1]}")
366
+
367
+ # Warn about unsupported fields
368
+ if f.peer_ip or f.peer_ips:
369
+ print(
370
+ "Warning: peer_ip and peer_ips are not supported by this BGPStream filter string parser and will be ignored."
371
+ )
372
+
373
+ # Join all parts with 'and' as required by the parser
374
+ return " and ".join(parts)
375
+
376
+
+ def build_bgpkit_cmd(filepath: str, filters: FilterOptions) -> list[str]:
+     # Start with the base command and file path
+     cmd = ["bgpkit-parser", filepath]
+
+     # 1. Simple integer/string mappings
+     if filters.origin_asn:
+         cmd.extend(["--origin-asn", str(filters.origin_asn)])
+
+     if filters.peer_ip:
+         cmd.extend(["--peer-ip", str(filters.peer_ip)])
+
+     if filters.peer_asn:
+         cmd.extend(["--peer-asn", str(filters.peer_asn)])
+
+     if filters.as_path:
+         cmd.extend(["--as-path", filters.as_path])
+
+     # 2. Prefix logic (handling super/sub flags)
+     # We prioritize the most specific prefix field provided
+     prefix_val = None
+     if filters.prefix:
+         prefix_val = filters.prefix
+     elif filters.prefix_super:
+         prefix_val = filters.prefix_super
+         cmd.append("--include-super")
+     elif filters.prefix_sub:
+         prefix_val = filters.prefix_sub
+         cmd.append("--include-sub")
+     elif filters.prefix_super_sub:
+         prefix_val = filters.prefix_super_sub
+         cmd.extend(["--include-super", "--include-sub"])
+
+     if prefix_val:
+         cmd.extend(["--prefix", prefix_val])
+
+     # 3. List-based filters (using the --filter "key=value" format)
+     if filters.peer_ips:
+         # If it's a list, we add a generic filter for the comma-separated string
+         ips_str = ",".join(str(ip) for ip in filters.peer_ips)
+         cmd.extend(["--filter", f"peer_ips={ips_str}"])
+
+     # 4. Enums and literals
+     if filters.update_type:
+         # CLI accepts 'a' for announce and 'w' for withdraw
+         val = "a" if filters.update_type == "announce" else "w"
+         cmd.extend(["--elem-type", val])
+
+     if filters.ip_version:
+         if filters.ip_version == 4:
+             cmd.append("--ipv4-only")
+         elif filters.ip_version == 6:
+             cmd.append("--ipv6-only")
+
+     return cmd
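Both subprocess parsers consume pipe-separated lines. A hypothetical `bgpdump -m` line walked through the index layout that BGPdumpParser._convert assumes (the route data is made up):

    # 0=proto, 1=time, 2=type, 3=peer_ip, 4=peer_as, 5=prefix,
    # 6=as-path, 8=next-hop, 11=communities
    line = "BGP4MP|1704067200|A|198.51.100.1|64500|203.0.113.0/24|64500 64501|IGP|198.51.100.1|0|0|64500:100|NAG||"
    f = line.rstrip().split("|")
    assert (f[2], f[4], f[5]) == ("A", "64500", "203.0.113.0/24")
    assert f[11].split() == ["64500:100"]
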
--- pybgpkitstream/bgpstreamconfig.py
+++ pybgpkitstream/bgpstreamconfig.py
@@ -1,4 +1,6 @@
  import datetime
+ import importlib.util  # find_spec lives in the util submodule
+ import shutil
  from pydantic import BaseModel, Field, DirectoryPath, field_validator
  from typing import Literal
  from ipaddress import IPv4Address, IPv6Address
@@ -31,7 +33,7 @@ class FilterOptions(BaseModel):
      peer_ips: list[str | IPv4Address | IPv6Address] | None = Field(
          default=None, description="Filter by a list of BGP peer IP addresses."
      )
-     peer_asn: str | None = Field(
+     peer_asn: int | None = Field(
          default=None, description="Filter by the AS number of the BGP peer."
      )
      update_type: Literal["withdraw", "announce"] | None = Field(
@@ -40,17 +42,13 @@ class FilterOptions(BaseModel):
      as_path: str | None = Field(
          default=None, description="Filter by a regular expression matching the AS path."
      )
-     ip_version: Literal["ipv4", "ipv6"] | None = Field(
+     ip_version: Literal[4, 6] | None = Field(
          default=None, description="Filter by ip version."
      )
 
 
  class BGPStreamConfig(BaseModel):
-     """
-     Unified BGPStream config.
-
-     Filters are primarily written for BGPKit but utils to convert to pybgpstream are provided in tests/pybgpstream_utils.
-     """
+     """Unified BGPStream config, compatible with BGPKIT and pybgpstream"""
 
      start_time: datetime.datetime = Field(description="Start of the stream")
      end_time: datetime.datetime = Field(description="End of the stream")
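Because FilterOptions is a pydantic model and the parsers call model_dump(exclude_unset=True), only the fields a caller actually sets are forwarded to the backend. For example:

    from pybgpkitstream import FilterOptions

    f = FilterOptions(peer_asn=64500, ip_version=4)
    print(f.model_dump(exclude_unset=True))
    # {'peer_asn': 64500, 'ip_version': 4}
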
@@ -58,18 +56,8 @@ class BGPStreamConfig(BaseModel):
      data_types: list[Literal["ribs", "updates"]] = Field(
          description="List of archive files to consider (`ribs` or `updates`)"
      )
-     cache_dir: DirectoryPath | None = Field(
-         default=None,
-         description="Specifies the directory for caching downloaded files.",
-     )
+
      filters: FilterOptions | None = Field(default=None, description="Optional filters")
-     max_concurrent_downloads: int | None = Field(
-         default=None, description="Maximum concurrent downloads when caching"
-     )
-     chunk_time: datetime.timedelta | None = Field(
-         default=datetime.timedelta(hours=2),
-         description="Interval for the fetch/parse cycle (avoid long prefetch time)",
-     )
 
      @field_validator("start_time", "end_time")
      @classmethod
@@ -80,3 +68,75 @@ class BGPStreamConfig(BaseModel):
          # if timezone-aware, convert to utc
          else:
              return dt.astimezone(datetime.timezone.utc)
+
+
+ class PyBGPKITStreamConfig(BaseModel):
+     """Unified BGPStream config plus parameters specific to the PyBGPKIT implementation (all optional)"""
+
+     bgpstream_config: BGPStreamConfig
+
+     max_concurrent_downloads: int | None = Field(
+         default=10, description="Maximum concurrent downloads of archive files."
+     )
+
+     cache_dir: DirectoryPath | None = Field(
+         default=None,
+         description="Specifies the directory for caching downloaded files.",
+     )
+
+     ram_fetch: bool | None = Field(
+         default=True,
+         description=(
+             "If caching is disabled, fetch temp files into shared RAM (/dev/shm) instead of the normal disk temp dir (/tmp). "
+             "The default (True) improves performance and reduces disk wear, at the expense of increased RAM usage."
+         ),
+     )
+
+     chunk_time: datetime.timedelta | None = Field(
+         default=datetime.timedelta(hours=2),
+         description=(
+             "Interval for the fetch/parse cycles (benefits: avoids long prefetch times and allows periodic temp cleanup when caching is disabled). "
+             "A smaller value means less RAM/disk used at the cost of performance."
+         ),
+     )
+
+     parser: Literal["pybgpkit", "bgpkit", "pybgpstream", "bgpdump"] = Field(
+         default="pybgpkit",
+         description=(
+             "MRT file parser. The default `pybgpkit` ships with the package but is slow; the others require system dependencies."
+         ),
+     )
+
+     @field_validator("parser")
+     @classmethod
+     def check_parser_available(cls, parser: str) -> str:
+         if parser == "pybgpkit":
+             if importlib.util.find_spec("bgpkit") is None:
+                 raise ValueError(
+                     "pybgpkit is not installed. Install with: pip install pybgpkit"
+                 )
+
+         elif parser == "pybgpstream":
+             if importlib.util.find_spec("pybgpstream") is None:
+                 raise ValueError(
+                     "pybgpstream is not installed. "
+                     "Install with: pip install pybgpstream (ensure system dependencies are met)"
+                 )
+
+         elif parser == "bgpdump":
+             if shutil.which("bgpdump") is None:
+                 raise ValueError(
+                     "bgpdump binary not found in PATH. "
+                     "Install with: sudo apt-get install bgpdump"
+                 )
+
+         elif parser == "bgpkit":
+             # The CLI backend is invoked as `bgpkit-parser` (see build_bgpkit_cmd)
+             if shutil.which("bgpkit-parser") is None:
+                 raise ValueError(
+                     "bgpkit-parser binary not found in PATH. "
+                     "Install from: https://github.com/bgpkit/bgpkit-parser "
+                     "or use cargo: cargo install bgpkit-parser"
+                 )
+
+         # Return the parser value if validation passes
+         return parser
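Putting the two config layers together, a sketch of end-to-end use (the collector and dates are illustrative, and BGPStreamConfig is assumed to expose the `collectors` field that from_config reads):

    import datetime
    from pybgpkitstream import BGPKITStream, BGPStreamConfig, PyBGPKITStreamConfig

    stream_config = BGPStreamConfig(
        start_time=datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc),
        end_time=datetime.datetime(2024, 1, 1, 2, tzinfo=datetime.timezone.utc),
        collectors=["rrc00"],
        data_types=["updates"],
    )
    config = PyBGPKITStreamConfig(
        bgpstream_config=stream_config,
        parser="pybgpkit",  # validated against the installed backends
        ram_fetch=True,
    )
    stream = BGPKITStream.from_config(config)
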
--- /dev/null
+++ pybgpkitstream/utils.py
@@ -0,0 +1,11 @@
+ import datetime
+ import re
+
+ def dt_from_filepath(filepath: str, pattern=r"(\d{8}\.\d{4})") -> datetime.datetime:
+     match = re.search(pattern, filepath)
+     if not match:
+         raise RuntimeError("Could not determine time from filepath")
+     timestamp_str = match.group(1)
+     dt = datetime.datetime.strptime(timestamp_str, "%Y%m%d.%H%M")
+     dt = dt.replace(tzinfo=datetime.timezone.utc)
+     return dt
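dt_from_filepath leans on the YYYYMMDD.HHMM stamp embedded in RouteViews/RIS archive names; a quick usage sketch (the path is illustrative):

    from pybgpkitstream.utils import dt_from_filepath

    dt = dt_from_filepath("rrc00/updates.20240101.0000.gz")
    print(dt.isoformat())  # 2024-01-01T00:00:00+00:00
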
--- pybgpkitstream-0.1.6.dist-info/METADATA
+++ pybgpkitstream-0.1.9.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: pybgpkitstream
- Version: 0.1.6
+ Version: 0.1.9
  Summary: Drop-in replacement for PyBGPStream using BGPKIT
  Author: JustinLoye
  Author-email: JustinLoye <jloye@iij.ad.jp>
--- /dev/null
+++ pybgpkitstream-0.1.9.dist-info/RECORD
@@ -0,0 +1,12 @@
+ pybgpkitstream/__init__.py,sha256=OGWVhZdSvialNkIkQ1VBrmiyOcwkCA1D5IaLo7WQnPI,209
+ pybgpkitstream/bgpelement.py,sha256=7mXSUmWThhIbKy2JVsLchoteve0BshT3uH8cdbAe0Go,1176
+ pybgpkitstream/bgpkitstream.py,sha256=CKQv7dU-ooznuD1AjHKnZ6qRdPH1ZiOIEGtVNtU8PCY,15062
+ pybgpkitstream/bgpparser.py,sha256=aJcVCv_ydy3xQcH_BBxQE4hc7G1rLYqqNJAXCdnrasA,14689
+ pybgpkitstream/bgpstreamconfig.py,sha256=DWnQkvmmpzuvU0RT6Ko4AokA795_HIi5xwpUhANsUN4,5465
+ pybgpkitstream/cli.py,sha256=E0E1hO0fzGhy1skBopRufdewsiSy_mA-J8Gf2WxBRxo,4214
+ pybgpkitstream/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ pybgpkitstream/utils.py,sha256=6FwEEpBtY_20BDlJPOPFmTYQGqw7fCBLjXmnd7gjBdQ,404
+ pybgpkitstream-0.1.9.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+ pybgpkitstream-0.1.9.dist-info/entry_points.txt,sha256=aWhImGlXLtRKkfyJHudcbSp5K5As4ZGL8wXZC0y6q4o,60
+ pybgpkitstream-0.1.9.dist-info/METADATA,sha256=CEiHubGgTr9Ef_L4e2fqh0pPo1F9UtIHBosKcXUoWe4,2953
+ pybgpkitstream-0.1.9.dist-info/RECORD,,
--- pybgpkitstream-0.1.6.dist-info/RECORD
+++ /dev/null
@@ -1,10 +0,0 @@
- pybgpkitstream/__init__.py,sha256=kNfv6bvDkaGKjlw0pr9LWVqOQtIGmIPk-VG1ZCBuA38,163
- pybgpkitstream/bgpelement.py,sha256=7mXSUmWThhIbKy2JVsLchoteve0BshT3uH8cdbAe0Go,1176
- pybgpkitstream/bgpkitstream.py,sha256=VU79Olu1wk_GNBMxsCKCMjMfSU5YoKA9WsCf-4sOMBk,10997
- pybgpkitstream/bgpstreamconfig.py,sha256=tFZEpqOoFMMNbyf6dzKHEtHZnEnm1Gv88hKy6BqTCq8,3299
- pybgpkitstream/cli.py,sha256=E0E1hO0fzGhy1skBopRufdewsiSy_mA-J8Gf2WxBRxo,4214
- pybgpkitstream/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- pybgpkitstream-0.1.6.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
- pybgpkitstream-0.1.6.dist-info/entry_points.txt,sha256=aWhImGlXLtRKkfyJHudcbSp5K5As4ZGL8wXZC0y6q4o,60
- pybgpkitstream-0.1.6.dist-info/METADATA,sha256=yKcQtzIS0JFXppGsnMVnQRlhVp82_u6Es8Q1fgPLFqQ,2953
- pybgpkitstream-0.1.6.dist-info/RECORD,,
- pybgpkitstream-0.1.6.dist-info/RECORD,,