pybgpkitstream 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pybgpkitstream/bgpkitstream.py +87 -72
- pybgpkitstream/bgpstreamconfig.py +11 -1
- {pybgpkitstream-0.1.3.dist-info → pybgpkitstream-0.1.5.dist-info}/METADATA +1 -1
- pybgpkitstream-0.1.5.dist-info/RECORD +10 -0
- pybgpkitstream-0.1.3.dist-info/RECORD +0 -10
- {pybgpkitstream-0.1.3.dist-info → pybgpkitstream-0.1.5.dist-info}/WHEEL +0 -0
- {pybgpkitstream-0.1.3.dist-info → pybgpkitstream-0.1.5.dist-info}/entry_points.txt +0 -0
pybgpkitstream/bgpkitstream.py
CHANGED
@@ -9,6 +9,7 @@ from heapq import merge
 from operator import itemgetter
 import binascii
 import logging
+from tempfile import TemporaryDirectory
 
 import aiohttp
 import bgpkit
@@ -44,6 +45,17 @@ def crc32(input_str: str):
     return f"{crc:08x}"
 
 
+class Directory:
+    """Permanent directory that mimics TemporaryDirectory interface."""
+
+    def __init__(self, path):
+        self.name = str(path)
+
+    def cleanup(self):
+        """No-op cleanup for permanent directories."""
+        pass
+
+
 class BGPKITStream:
     def __init__(
         self,
@@ -60,7 +72,9 @@ class BGPKITStream:
         self.ts_end = ts_end
         self.collector_id = collector_id
         self.data_type = data_type
-        self.cache_dir = cache_dir
+        self.cache_dir: Directory | TemporaryDirectory = (
+            Directory(cache_dir) if cache_dir else TemporaryDirectory()
+        )
         self.filters = filters
         self.max_concurrent_downloads = max_concurrent_downloads
         self.chunk_time = chunk_time
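The new `Directory` helper duck-types `TemporaryDirectory`: both expose a `.name` attribute and a `.cleanup()` method, so the rest of the stream can treat `self.cache_dir` uniformly whether the cache is a permanent user-supplied path or a throwaway directory. A minimal standalone sketch of the pattern (the `open_cache` helper is illustrative, not part of the package):

```python
from tempfile import TemporaryDirectory


class Directory:
    """Permanent directory that mimics TemporaryDirectory's interface."""

    def __init__(self, path):
        self.name = str(path)

    def cleanup(self):
        pass  # no-op: a user-supplied cache directory is never deleted


def open_cache(cache_dir=None):
    # Same shape as the constructor logic above: keep a user-supplied
    # path permanently, otherwise create a throwaway directory.
    return Directory(cache_dir) if cache_dir else TemporaryDirectory()


cache = open_cache()   # TemporaryDirectory here; Directory if a path is given
print(cache.name)      # both variants expose .name for os.path.join
cache.cleanup()        # deletes the temp dir; no-op for a Directory
```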
@@ -72,7 +86,6 @@ class BGPKITStream:
         """Generate a cache filename compatible with BGPKIT parser."""
 
         hash_suffix = crc32(url)
-        print(url)
 
         if "updates." in url:
             data_type = "updates"
@@ -103,9 +116,8 @@ class BGPKITStream:
         self.urls = {"rib": defaultdict(list), "update": defaultdict(list)}
         for data_type in self.data_type:
             items: list[BrokerItem] = self.broker.query(
-                ts_start=
-                … (old line 107 not preserved in this diff view)
-                ts_end=datetime.datetime.fromtimestamp(self.ts_end),
+                ts_start=int(self.ts_start - 60),
+                ts_end=int(self.ts_end),
                 collector_id=self.collector_id,
                 data_type=data_type,
             )
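The new query passes plain epoch seconds and pads `ts_start` back by 60 seconds, so the broker also returns the MRT file whose interval begins just before the requested window; elements outside `[ts_start, ts_end]` are dropped later by the timestamp check in `__iter__`. A hedged sketch of the call (the collector and timestamps are illustrative):

```python
import bgpkit

broker = bgpkit.Broker()

ts_start, ts_end = 1_704_067_200, 1_704_070_800  # illustrative epoch seconds

items = broker.query(
    ts_start=int(ts_start - 60),  # pad backwards to catch the covering file
    ts_end=int(ts_end),
    collector_id="rrc00",         # illustrative collector
    data_type="rib",
)

for item in items:
    print(item.url)  # each BrokerItem carries the MRT file URL to fetch
```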
@@ -142,7 +154,7 @@ class BGPKITStream:
         for rc, rc_urls in self.urls[data_type].items():
             for url in rc_urls:
                 filename = self._generate_cache_filename(url)
-                filepath = os.path.join(self.cache_dir, filename)
+                filepath = os.path.join(self.cache_dir.name, filename)
 
                 if os.path.exists(filepath):
                     logging.debug(f"{filepath} is a cache hit")
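Cache hits rely on the filename being a pure function of the URL: `_generate_cache_filename` derives `hash_suffix` from `crc32(url)`. The helper's body is not shown in this diff; a plausible sketch consistent with the `binascii` import and the `f"{crc:08x}"` return seen above (the URL is illustrative):

```python
import binascii


def crc32(input_str: str) -> str:
    # Hex-encoded CRC32 of the URL; masking guarantees an unsigned value.
    crc = binascii.crc32(input_str.encode()) & 0xFFFFFFFF
    return f"{crc:08x}"


# The same URL always maps to the same cache filename suffix.
print(crc32("https://data.ris.ripe.net/rrc00/updates.20240101.0000.gz"))
```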
@@ -173,76 +185,79 @@ class BGPKITStream:
         return ((elem.timestamp, elem, is_rib, collector) for elem in iterator)
 
     def __iter__(self) -> Iterator[BGPElement]:
-        # … (old lines 176–201 not preserved in this diff view)
+        # try/finally to clean up the fetching cache
+        try:
+            # Manager mode: spawn smaller worker streams to balance fetch/parse
+            if self.chunk_time:
+                current = self.ts_start
+
+                while current < self.ts_end:
+                    chunk_end = min(current + self.chunk_time, self.ts_end)
+
+                    logging.info(
+                        f"Processing chunk: {datetime.datetime.fromtimestamp(current)} "
+                        f"to {datetime.datetime.fromtimestamp(chunk_end)}"
+                    )
+
+                    worker = type(self)(
+                        ts_start=current,
+                        ts_end=chunk_end
+                        - 1,  # remove one second because BGPKIT includes the border
+                        collector_id=self.collector_id,
+                        data_type=self.data_type,
+                        cache_dir=None,
+                        filters=self.filters,
+                        max_concurrent_downloads=self.max_concurrent_downloads,
+                        chunk_time=None,  # Worker doesn't chunk itself
+                    )
+
+                    yield from worker
+                    current = chunk_end + 1e-7
+
+                return
+
+            self._set_urls()
 
-        return
-
-        self._set_urls()
-
-        if self.cache_dir:
             asyncio.run(self._prefetch_data())
 
-        … (old lines 210–212 not preserved in this diff view)
+            # One iterator for each data_type * collector combination
+            # To be merged according to the elements' timestamps
+            iterators_to_merge = []
 
-        … (old lines 214–245 not preserved in this diff view)
+            for data_type in self.data_type:
+                is_rib = data_type == "rib"
+
+                # Get rib or update files per collector
+                rc_to_paths = self.paths[data_type]
+
+                # Chain rib or update iterators to get one stream per collector / data_type
+                for rc, paths in rc_to_paths.items():
+                    parsers = [
+                        bgpkit.Parser(url=path, filters=self.filters) for path in paths
+                    ]
+
+                    chained_iterator = chain.from_iterable(parsers)
+
+                    # Add metadata lost by bgpkit for compatibility with pybgpstream
+                    iterators_to_merge.append((chained_iterator, is_rib, rc))
+
+            # Make a generator to tag each bgpkit element with metadata
+            # Benefit 1: full compat with pybgpstream
+            # Benefit 2: it exposes an easy-to-access key for heapq to merge on
+            tagged_iterators = [
+                self._create_tagged_iterator(it, is_rib, rc)
+                for it, is_rib, rc in iterators_to_merge
+            ]
+
+            # Merge and convert to pybgpstream format
+            for timestamp, bgpkit_elem, is_rib, rc in merge(
+                *tagged_iterators, key=itemgetter(0)
+            ):
+                if self.ts_start <= timestamp <= self.ts_end:
+                    yield convert_bgpkit_elem(bgpkit_elem, is_rib, rc)
+
+        finally:
+            self.cache_dir.cleanup()
 
     @classmethod
     def from_config(cls, config: BGPStreamConfig):
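The rewrite's core trick is a k-way merge: each per-collector parser stream is wrapped in a generator of `(timestamp, element, is_rib, collector)` tuples, and `heapq.merge` interleaves the already-sorted streams by timestamp via `key=itemgetter(0)`. A self-contained sketch of the pattern, with dummy data in place of real BGP elements:

```python
from heapq import merge
from operator import itemgetter


def tag(stream, collector):
    # Mirrors _create_tagged_iterator: prepend the sort key, append metadata.
    return ((ts, elem, collector) for ts, elem in stream)


# Two already-sorted "parser" streams (dummy data, not real BGP elements).
rrc00 = [(1.0, "announce 10.0.0.0/8"), (3.0, "withdraw 10.0.0.0/8")]
rrc01 = [(2.0, "announce 192.0.2.0/24"), (4.0, "announce 198.51.100.0/24")]

# merge() is lazy and only requires each input to be individually sorted.
for ts, rc_elem in ((t, (rc, e)) for t, e, rc in merge(
        tag(rrc00, "rrc00"), tag(rrc01, "rrc01"), key=itemgetter(0))):
    print(ts, *rc_elem)
# 1.0 rrc00 announce 10.0.0.0/8
# 2.0 rrc01 announce 192.0.2.0/24
# 3.0 rrc00 withdraw 10.0.0.0/8
# 4.0 rrc01 announce 198.51.100.0/24
```

Manager mode reuses the same iteration recursively: each worker covers one `chunk_time` window, its `ts_end` is pulled back one second because BGPKIT treats the border as inclusive, and the `+ 1e-7` nudge keeps the next chunk from re-yielding the border element.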
pybgpkitstream/bgpstreamconfig.py
CHANGED
@@ -1,5 +1,5 @@
 import datetime
-from pydantic import BaseModel, Field, DirectoryPath
+from pydantic import BaseModel, Field, DirectoryPath, field_validator
 from typing import Literal
 from ipaddress import IPv4Address, IPv6Address
 
@@ -70,3 +70,13 @@ class BGPStreamConfig(BaseModel):
         default=datetime.timedelta(hours=2),
         description="Interval for the fetch/parse cycle (avoid long prefetch time)",
     )
+
+    @field_validator("start_time", "end_time")
+    @classmethod
+    def normalize_to_utc(cls, dt: datetime.datetime) -> datetime.datetime:
+        # if naive datetime (not timezone-aware) assume it's UTC
+        if dt.tzinfo is None:
+            return dt.replace(tzinfo=datetime.timezone.utc)
+        # if timezone-aware, convert to UTC
+        else:
+            return dt.astimezone(datetime.timezone.utc)
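The validator makes every configured timestamp timezone-aware in UTC: naive datetimes are assumed to already be UTC, aware ones are converted. A minimal sketch of the behavior with a stand-in model (`Window` is hypothetical; the real `BGPStreamConfig` has more fields):

```python
import datetime

from pydantic import BaseModel, field_validator


class Window(BaseModel):  # hypothetical stand-in for BGPStreamConfig
    start_time: datetime.datetime

    @field_validator("start_time")
    @classmethod
    def normalize_to_utc(cls, dt: datetime.datetime) -> datetime.datetime:
        if dt.tzinfo is None:
            return dt.replace(tzinfo=datetime.timezone.utc)
        return dt.astimezone(datetime.timezone.utc)


# Naive input: assumed to already be UTC.
print(Window(start_time="2024-01-01T12:00:00").start_time)
# 2024-01-01 12:00:00+00:00

# Aware input: converted to UTC.
print(Window(start_time="2024-01-01T12:00:00+02:00").start_time)
# 2024-01-01 10:00:00+00:00
```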
pybgpkitstream-0.1.5.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+pybgpkitstream/__init__.py,sha256=kNfv6bvDkaGKjlw0pr9LWVqOQtIGmIPk-VG1ZCBuA38,163
+pybgpkitstream/bgpelement.py,sha256=7mXSUmWThhIbKy2JVsLchoteve0BshT3uH8cdbAe0Go,1176
+pybgpkitstream/bgpkitstream.py,sha256=kaoGAx282FfE5I3_sHXN1Zhzkpr7y94ffI5ZePVydog,10609
+pybgpkitstream/bgpstreamconfig.py,sha256=tFZEpqOoFMMNbyf6dzKHEtHZnEnm1Gv88hKy6BqTCq8,3299
+pybgpkitstream/cli.py,sha256=E0E1hO0fzGhy1skBopRufdewsiSy_mA-J8Gf2WxBRxo,4214
+pybgpkitstream/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pybgpkitstream-0.1.5.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+pybgpkitstream-0.1.5.dist-info/entry_points.txt,sha256=aWhImGlXLtRKkfyJHudcbSp5K5As4ZGL8wXZC0y6q4o,60
+pybgpkitstream-0.1.5.dist-info/METADATA,sha256=b_tSeDSOKUN7V2u0ooWWIxnHN_pQS4CXvcC9JwrJb_g,2953
+pybgpkitstream-0.1.5.dist-info/RECORD,,
pybgpkitstream-0.1.3.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-pybgpkitstream/__init__.py,sha256=kNfv6bvDkaGKjlw0pr9LWVqOQtIGmIPk-VG1ZCBuA38,163
-pybgpkitstream/bgpelement.py,sha256=7mXSUmWThhIbKy2JVsLchoteve0BshT3uH8cdbAe0Go,1176
-pybgpkitstream/bgpkitstream.py,sha256=wJs6c1hs3oZJuPEg4l06MfU8Kz47NmhPngprp3DQ2ws,10100
-pybgpkitstream/bgpstreamconfig.py,sha256=_PHoNhq8lw4QzNKya-KQFQ24dEbTjTkmefFhx0t6K8Q,2873
-pybgpkitstream/cli.py,sha256=E0E1hO0fzGhy1skBopRufdewsiSy_mA-J8Gf2WxBRxo,4214
-pybgpkitstream/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pybgpkitstream-0.1.3.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
-pybgpkitstream-0.1.3.dist-info/entry_points.txt,sha256=aWhImGlXLtRKkfyJHudcbSp5K5As4ZGL8wXZC0y6q4o,60
-pybgpkitstream-0.1.3.dist-info/METADATA,sha256=M-5bvt3zjpA3IaoqFfiIzdWGgMEfHo16J4u730SRu8Q,2953
-pybgpkitstream-0.1.3.dist-info/RECORD,,
{pybgpkitstream-0.1.3.dist-info → pybgpkitstream-0.1.5.dist-info}/WHEEL
File without changes
{pybgpkitstream-0.1.3.dist-info → pybgpkitstream-0.1.5.dist-info}/entry_points.txt
File without changes