pcapml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pcapml/__init__.py ADDED
@@ -0,0 +1,46 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """pcapml: read pcapml-labeled network traces into pandas DataFrames.
3
+
4
+ Quick start
5
+ -----------
6
+ >>> import pcapml
7
+ >>> df = pcapml.read_pcapml("dataset.pcapng")
8
+ >>> df[["timestamp", "sample_id", "label", "src_ip", "dst_ip", "proto"]].head()
9
+
10
+ Group by sample:
11
+ >>> for sample in pcapml.samples("dataset.pcapng"):
12
+ ... print(sample.sample_id, sample.label, len(sample))
13
+ """
14
+
15
+ from importlib.metadata import PackageNotFoundError, version
16
+
17
+ from ._pcapng import Interface, Packet, iter_packets
18
+ from .reader import (
19
+ COLUMNS,
20
+ Sample,
21
+ parse_comment,
22
+ read,
23
+ read_pcapml,
24
+ sampler,
25
+ samples,
26
+ )
27
+
28
+ try:
29
+ # Single source of truth: the version declared in pyproject.toml.
30
+ __version__ = version("pcapml")
31
+ except PackageNotFoundError: # running from a source tree that isn't installed
32
+ __version__ = "0.0.0+unknown"
33
+
34
+ __all__ = [
35
+ "read_pcapml",
36
+ "read",
37
+ "samples",
38
+ "sampler",
39
+ "Sample",
40
+ "parse_comment",
41
+ "iter_packets",
42
+ "Packet",
43
+ "Interface",
44
+ "COLUMNS",
45
+ "__version__",
46
+ ]
pcapml/_decode.py ADDED
@@ -0,0 +1,156 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Minimal pure-Python L3/L4 header decoding.
3
+
4
+ Just enough to populate the convenience columns (addresses, protocol, ports).
5
+ The full packet bytes are always preserved in the DataFrame for anyone who
6
+ wants a real dissector (dpkt, scapy, ...).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import socket
12
+ import struct
13
+ from typing import Dict, Optional, Tuple
14
+
15
+ # Selected link types (https://www.tcpdump.org/linktypes.html).
16
+ LINKTYPE_ETHERNET = 1
17
+ LINKTYPE_RAW = 101
18
+ LINKTYPE_LINUX_SLL = 113
19
+ # Some tools emit 12/14 for "raw IP"; treat them as raw too.
20
+ _RAW_IP_LINKTYPES = frozenset({LINKTYPE_RAW, 12, 14})
21
+
22
+ ETHERTYPE_IPV4 = 0x0800
23
+ ETHERTYPE_IPV6 = 0x86DD
24
+ _VLAN_ETHERTYPES = frozenset({0x8100, 0x88A8, 0x9100})
25
+
26
+ IP_PROTO_NAMES: Dict[int, str] = {
27
+ 1: "ICMP",
28
+ 2: "IGMP",
29
+ 6: "TCP",
30
+ 17: "UDP",
31
+ 41: "IPv6",
32
+ 47: "GRE",
33
+ 50: "ESP",
34
+ 51: "AH",
35
+ 58: "IPv6-ICMP",
36
+ 89: "OSPF",
37
+ 132: "SCTP",
38
+ }
39
+
40
+ # IPv6 extension headers we skip over to reach the transport header.
41
+ _IPV6_EXT_HEADERS = frozenset({0, 43, 60}) # hop-by-hop, routing, dest-options
42
+
43
+ _EMPTY: Dict[str, Optional[object]] = {
44
+ "src_ip": None,
45
+ "dst_ip": None,
46
+ "proto": None,
47
+ "src_port": None,
48
+ "dst_port": None,
49
+ }
50
+
51
+
52
def decode(data: bytes, link_type: int) -> Dict[str, Optional[object]]:
    """Extract the convenience L3/L4 fields from one captured packet.

    The link-layer header is removed according to ``link_type`` and the
    remaining bytes are handed to the IPv4 or IPv6 decoder. The result always
    has the keys ``src_ip``, ``dst_ip``, ``proto``, ``src_port`` and
    ``dst_port``; anything that cannot be determined is ``None``.

    This function is best-effort by contract: it never raises.
    """
    try:
        l3_bytes, l3_kind = _strip_l2(data, link_type)
        if l3_bytes is not None:
            if l3_kind == ETHERTYPE_IPV4:
                return _decode_ipv4(l3_bytes)
            if l3_kind == ETHERTYPE_IPV6:
                return _decode_ipv6(l3_bytes)
    except Exception:
        # Deliberate catch-all: a malformed packet must yield an all-None row
        # rather than abort the whole read.
        pass
    # Unknown link type, non-IP payload, or a decoding failure.
    return dict(_EMPTY)
65
+
66
+
67
def _strip_l2(data: bytes, link_type: int) -> Tuple[Optional[bytes], Optional[int]]:
    """Remove the link-layer header and report what follows it.

    Returns ``(l3_payload, ethertype)``, or ``(None, None)`` when the frame is
    too short for its link type or the link type is not handled here.
    """
    size = len(data)

    if link_type == LINKTYPE_ETHERNET:
        if size < 14:
            return None, None
        (ethertype,) = struct.unpack_from("!H", data, 12)
        l3_start = 14
        # Each VLAN tag adds 4 bytes; the real ethertype sits 2 bytes into it.
        while size >= l3_start + 4 and ethertype in _VLAN_ETHERTYPES:
            (ethertype,) = struct.unpack_from("!H", data, l3_start + 2)
            l3_start += 4
        return data[l3_start:], ethertype

    if link_type == LINKTYPE_LINUX_SLL:
        if size < 16:
            return None, None
        # The protocol field occupies the last two bytes of the 16-byte header.
        (protocol,) = struct.unpack_from("!H", data, 14)
        return data[16:], protocol

    if link_type == 0 or link_type in _RAW_IP_LINKTYPES:
        return _raw_ip(data)

    return None, None
88
+
89
+
90
def _raw_ip(data: bytes) -> Tuple[Optional[bytes], Optional[int]]:
    """Classify a header-less IP packet by the version nibble of its first byte.

    Returns ``(data, ETHERTYPE_IPV4)`` or ``(data, ETHERTYPE_IPV6)``; empty
    input or any other version gives ``(None, None)``.
    """
    if not data:
        return None, None
    ethertype = {4: ETHERTYPE_IPV4, 6: ETHERTYPE_IPV6}.get(data[0] >> 4)
    if ethertype is None:
        return None, None
    return data, ethertype
100
+
101
+
102
+ def _ip_str(raw: bytes, family: int) -> Optional[str]:
103
+ try:
104
+ return socket.inet_ntop(family, raw)
105
+ except (OSError, ValueError):
106
+ return None
107
+
108
+
109
+ def _ports(payload: bytes, off: int, proto: int) -> Tuple[Optional[int], Optional[int]]:
110
+ if proto in (6, 17, 132) and len(payload) >= off + 4: # TCP / UDP / SCTP
111
+ src, dst = struct.unpack_from("!HH", payload, off)
112
+ return src, dst
113
+ return None, None
114
+
115
+
116
def _result(src_ip, dst_ip, proto, src_port, dst_port) -> Dict[str, Optional[object]]:
    """Assemble the decoded-field dict, translating ``proto`` to a display string.

    A protocol number listed in ``IP_PROTO_NAMES`` becomes its name; any other
    number becomes its decimal string; ``None`` stays ``None``.
    """
    if proto is None:
        proto_label = None
    else:
        proto_label = IP_PROTO_NAMES.get(proto, str(proto))
    return dict(
        src_ip=src_ip,
        dst_ip=dst_ip,
        proto=proto_label,
        src_port=src_port,
        dst_port=dst_port,
    )
124
+
125
+
126
def _decode_ipv4(p: bytes) -> Dict[str, Optional[object]]:
    """Decode addresses, protocol and (when present) ports from an IPv4 packet.

    ``p`` starts at the IPv4 header. Packets shorter than the 20-byte minimum
    header give the all-``None`` result. Ports are reported only when a
    transport header can actually be at the computed offset: the header length
    must be at least 20 bytes and the packet must not be a non-first fragment.
    """
    if len(p) < 20:
        return dict(_EMPTY)
    ihl = (p[0] & 0x0F) * 4
    proto = p[9]
    src_ip = _ip_str(p[12:16], socket.AF_INET)
    dst_ip = _ip_str(p[16:20], socket.AF_INET)

    # Low 13 bits of the flags/fragment-offset word: non-zero means this is a
    # later fragment, which carries payload only and no transport header.
    frag_offset = struct.unpack_from("!H", p, 6)[0] & 0x1FFF
    if ihl < 20 or frag_offset:
        # An IHL below 5 words is invalid; reading "ports" at that offset
        # would return bytes from inside the IP header itself.
        return _result(src_ip, dst_ip, proto, None, None)

    src_port, dst_port = _ports(p, ihl, proto)
    return _result(src_ip, dst_ip, proto, src_port, dst_port)
135
+
136
+
137
def _decode_ipv6(p: bytes) -> Dict[str, Optional[object]]:
    """Decode addresses, protocol and (when present) ports from an IPv6 packet.

    ``p`` starts at the 40-byte fixed IPv6 header; shorter input gives the
    all-``None`` result. Hop-by-hop, routing, destination-options and fragment
    headers are walked to reach the transport header. For a non-first fragment
    the bytes after the fragment header are payload, so no ports are reported.
    """
    if len(p) < 40:
        return dict(_EMPTY)
    next_hdr = p[6]
    src_ip = _ip_str(p[8:24], socket.AF_INET6)
    dst_ip = _ip_str(p[24:40], socket.AF_INET6)

    off = 40
    # Walk extension headers (each is 8-byte aligned via hdr_ext_len) to the
    # transport header. Fragment headers (44) are a fixed 8 bytes.
    while next_hdr in _IPV6_EXT_HEADERS or next_hdr == 44:
        if off + 2 > len(p):
            return _result(src_ip, dst_ip, None, None, None)
        ext_next = p[off]
        if next_hdr == 44:
            ext_len = 8
            # The upper 13 bits of bytes 2-3 hold the fragment offset.
            if off + 4 <= len(p) and struct.unpack_from("!H", p, off + 2)[0] >> 3:
                # Later fragment: nothing after this header is a header. Report
                # the protocol only if it names an upper-layer protocol.
                is_ext = ext_next in _IPV6_EXT_HEADERS or ext_next == 44
                proto = None if is_ext else ext_next
                return _result(src_ip, dst_ip, proto, None, None)
        else:
            ext_len = (p[off + 1] + 1) * 8
        next_hdr = ext_next
        off += ext_len

    src_port, dst_port = _ports(p, off, next_hdr)
    return _result(src_ip, dst_ip, next_hdr, src_port, dst_port)
pcapml/_pcapng.py ADDED
@@ -0,0 +1,193 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Pure-Python streaming reader for the pcapng blocks pcapml emits.
3
+
4
+ This module knows nothing about pcapml labels; it just yields packets with the
5
+ raw per-packet comment string. Higher layers parse the comment. Only the block
6
+ types pcapml writes are interpreted (SHB, IDB, EPB); anything else is skipped.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import struct
12
+ from dataclasses import dataclass
13
+ from typing import BinaryIO, Iterator, List, Optional, Union
14
+
15
+ # Block type codes.
16
+ SECTION_HEADER = 0x0A0D0D0A
17
+ INTERFACE_DESC = 0x00000001
18
+ ENHANCED_PACKET = 0x00000006
19
+ BYTE_ORDER_MAGIC = 0x1A2B3C4D
20
+
21
+ # Option codes.
22
+ OPT_END = 0
23
+ OPT_COMMENT = 1
24
+ IDB_OPT_TSRESOL = 9
25
+
26
+ # The 4 bytes of the Section Header Block type are byte-order independent, which
27
+ # is exactly how a reader bootstraps endianness for the rest of the section.
28
+ _SHB_MAGIC_BYTES = b"\x0a\x0d\x0d\x0a"
29
+
30
+
31
@dataclass
class Interface:
    """The fields of an Interface Description Block that packet parsing needs."""

    # Link-layer type code of the interface.
    link_type: int
    # Snapshot length read from the block.
    snap_len: int
    # Seconds represented by one timestamp tick; 1e-6 (microseconds) unless an
    # if_tsresol option says otherwise.
    ts_resol: float = 1e-6
38
+
39
+
40
@dataclass
class Packet:
    """One Enhanced Packet Block with its pcapml comment."""

    # Index of the interface the packet was captured on.
    interface_id: int
    # Raw 64-bit timestamp, in units of ``ts_resol`` seconds.
    ts_ticks: int
    # Number of bytes captured (length of ``data``).
    cap_len: int
    # Original on-wire packet length.
    orig_len: int
    # Captured packet bytes, starting at the link-layer header.
    data: bytes
    # Text of the packet comment option ("" when absent).
    comment: str
    # Link type and tick size copied from the owning interface.
    link_type: int
    ts_resol: float

    @property
    def timestamp(self) -> float:
        """Capture time as POSIX seconds (float)."""
        return self.ts_ticks * self.ts_resol

    @property
    def ts_nanos(self) -> int:
        """Capture time as integer nanoseconds.

        When the resolution is (to within float noise) one second divided by
        a whole number of ticks -- which covers the usual ns/us/ms and
        power-of-two resolutions -- the conversion is done in integer
        arithmetic. Going through float seconds cannot hold epoch-scale
        nanosecond values exactly and shifts them by up to a few hundred
        nanoseconds. Other resolutions fall back to the float computation.
        """
        try:
            inverse = 1.0 / self.ts_resol
            ticks_per_second = round(inverse)
        except (ZeroDivisionError, OverflowError, ValueError):
            # Zero, subnormal or NaN resolution: use the float fallback below.
            inverse, ticks_per_second = 0.0, 0
        if ticks_per_second > 0 and abs(inverse - ticks_per_second) <= 1e-9 * inverse:
            # Exact integer conversion, rounded to the nearest nanosecond.
            scaled = self.ts_ticks * 1_000_000_000 + ticks_per_second // 2
            return scaled // ticks_per_second
        return int(round(self.ts_ticks * self.ts_resol * 1e9))
62
+
63
+
64
+ def _read_exact(f: BinaryIO, n: int) -> Optional[bytes]:
65
+ """Read exactly n bytes or return None at a clean EOF."""
66
+ chunks = []
67
+ remaining = n
68
+ while remaining > 0:
69
+ chunk = f.read(remaining)
70
+ if not chunk:
71
+ return None
72
+ chunks.append(chunk)
73
+ remaining -= len(chunk)
74
+ return b"".join(chunks)
75
+
76
+
77
+ def _ts_resol_from_byte(v: int) -> float:
78
+ """Decode an if_tsresol option byte into seconds-per-tick."""
79
+ if v & 0x80:
80
+ return 2.0 ** -(v & 0x7F)
81
+ return 10.0 ** -v
82
+
83
+
84
def _iter_options(body: bytes, off: int, le: str):
    """Iterate the option list that begins at ``off`` inside ``body``.

    Yields ``(code, value_bytes)`` pairs. Iteration stops at the end-of-options
    marker, when fewer than four header bytes remain, or when an option's
    declared length would run past the end of ``body``. ``le`` is the struct
    byte-order prefix for the section.
    """
    header_fmt = le + "HH"
    limit = len(body)
    pos = off
    while pos + 4 <= limit:
        code, length = struct.unpack_from(header_fmt, body, pos)
        if code == OPT_END:
            return
        value_start = pos + 4
        value_end = value_start + length
        if value_end > limit:
            return
        yield code, body[value_start:value_end]
        # Values are padded so the next option starts on a 32-bit boundary.
        pos = value_end + (-length % 4)
97
+
98
+
99
def iter_packets(source: Union[str, BinaryIO]) -> Iterator[Packet]:
    """Yield :class:`Packet` for every Enhanced Packet Block in a pcapng stream.

    ``source`` may be a filesystem path (``str``, ``bytes`` or any
    ``os.PathLike`` such as ``pathlib.Path``) or an already-open binary file
    object. A path is opened here and closed when iteration finishes; a file
    object is read as-is and left open for the caller.
    """
    import os  # local import: only needed for the path-like check

    if isinstance(source, (str, bytes, os.PathLike)):
        with open(source, "rb") as f:
            yield from _iter_packets(f)
    else:
        yield from _iter_packets(source)
109
+
110
+
111
def _iter_packets(f: BinaryIO) -> Iterator[Packet]:
    """Stream packets from an open pcapng file object.

    Blocks are read one at a time. A Section Header Block sets the byte order
    and clears the interface list; Interface Description Blocks extend the
    list; Enhanced Packet Blocks are parsed and yielded. Every other block
    type is skipped. Iteration ends silently at EOF or at the first block that
    is truncated or structurally impossible.
    """
    le = "<"
    interfaces: List[Interface] = []

    while True:
        type_bytes = _read_exact(f, 4)
        if type_bytes is None:
            return

        if type_bytes == _SHB_MAGIC_BYTES:
            # Section Header Block: re-establish endianness and reset interfaces.
            len_bytes = _read_exact(f, 4)
            magic_bytes = _read_exact(f, 4)
            if len_bytes is None or magic_bytes is None:
                return
            if struct.unpack("<I", magic_bytes)[0] == BYTE_ORDER_MAGIC:
                le = "<"
            elif struct.unpack(">I", magic_bytes)[0] == BYTE_ORDER_MAGIC:
                le = ">"
            else:
                return  # corrupt: byte-order magic matches neither endianness
            block_len = struct.unpack(le + "I", len_bytes)[0]
            # 12 bytes already consumed (type + length + magic).
            if block_len < 12:
                return  # corrupt: length smaller than what was already read
            if _read_exact(f, block_len - 12) is None:
                return
            interfaces = []
            continue

        len_bytes = _read_exact(f, 4)
        if len_bytes is None:
            return
        block_type = struct.unpack(le + "I", type_bytes)[0]
        block_len = struct.unpack(le + "I", len_bytes)[0]
        if block_len < 12:
            return  # corrupt: length must cover type+length+trailer
        rest = _read_exact(f, block_len - 8)
        if rest is None:
            return
        body = rest[:-4]  # drop the trailing redundant block length

        if block_type == INTERFACE_DESC:
            interfaces.append(_parse_idb(body, le))
        elif block_type == ENHANCED_PACKET:
            pkt = _parse_epb(body, le, interfaces)
            if pkt is not None:
                yield pkt
        # Other block types (SPB, NRB, ISB, ...) carry no pcapml labels: skip.
153
+
154
+
155
def _parse_idb(body: bytes, le: str) -> Interface:
    """Build an :class:`Interface` from an Interface Description Block body.

    ``body`` is the block content without the type/length framing and ``le``
    is the struct byte-order prefix for the section. The timestamp resolution
    defaults to microseconds unless an if_tsresol option is present.

    A body too short to hold the 8 fixed bytes yields a placeholder interface
    (link type 0) instead of raising, so a damaged block neither aborts
    iteration nor shifts the indices of later interfaces.
    """
    if len(body) < 8:
        return Interface(link_type=0, snap_len=0)
    link_type, _reserved, snap_len = struct.unpack_from(le + "HHI", body, 0)
    ts_resol = 1e-6
    for code, val in _iter_options(body, 8, le):
        if code == IDB_OPT_TSRESOL and val:
            ts_resol = _ts_resol_from_byte(val[0])
    return Interface(link_type=link_type, snap_len=snap_len, ts_resol=ts_resol)
162
+
163
+
164
def _parse_epb(body: bytes, le: str, interfaces: List[Interface]) -> Optional[Packet]:
    """Turn an Enhanced Packet Block body into a :class:`Packet`.

    Returns ``None`` when the body is shorter than the 20-byte fixed header or
    holds fewer packet bytes than its captured-length field claims. The first
    comment option, if any, becomes ``Packet.comment``. A packet whose
    interface index is out of range is given link type 0 and the default
    timestamp resolution.
    """
    if len(body) < 20:
        return None

    header = struct.unpack_from(le + "IIIII", body, 0)
    iface_id, ts_high, ts_low, cap_len, orig_len = header

    payload_end = 20 + cap_len
    data = body[20:payload_end]
    if len(data) < cap_len:
        return None

    # Packet data is padded to a 32-bit boundary before the options begin.
    opt_off = payload_end + (-cap_len % 4)
    comment = next(
        (
            val.rstrip(b"\x00").decode("utf-8", "replace")
            for code, val in _iter_options(body, opt_off, le)
            if code == OPT_COMMENT
        ),
        "",
    )

    known_interface = 0 <= iface_id < len(interfaces)
    iface = interfaces[iface_id] if known_interface else Interface(link_type=0, snap_len=0)

    return Packet(
        interface_id=iface_id,
        ts_ticks=(ts_high << 32) | ts_low,
        cap_len=cap_len,
        orig_len=orig_len,
        data=data,
        comment=comment,
        link_type=iface.link_type,
        ts_resol=iface.ts_resol,
    )
pcapml/py.typed ADDED
File without changes
pcapml/reader.py ADDED
@@ -0,0 +1,187 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """High-level pcapml reader: pcapng -> pandas DataFrame and per-sample grouping."""
3
+
4
+ from __future__ import annotations
5
+
6
+ from dataclasses import dataclass
7
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
8
+
9
+ import pandas as pd
10
+
11
+ from . import _decode
12
+ from ._pcapng import Packet, iter_packets
13
+
14
+ # Column order for the per-packet DataFrame.
15
+ COLUMNS = [
16
+ "timestamp",
17
+ "sample_id",
18
+ "label",
19
+ "direction",
20
+ "dst",
21
+ "src_ip",
22
+ "dst_ip",
23
+ "proto",
24
+ "src_port",
25
+ "dst_port",
26
+ "length",
27
+ "raw",
28
+ ]
29
+
30
+ # Comment keys already surfaced as dedicated core columns. Any *other* key found
31
+ # in a comment is promoted to its own column, named after the key.
32
+ _CORE_KEYS = frozenset({"s", "proc", "label", "dir", "d", "dst"})
33
+
34
+
35
def parse_comment(comment: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Dict[str, str]]:
    """Split a pcapml packet comment into its labelled parts.

    Returns ``(sample_id, label, direction, dst, metadata)``.

    Two encodings are understood: the keyed form
    (``s=0,proc=curl,dir=lan2wan,dst=example.com``) and the legacy positional
    form (``<sample_id>,<label>``). ``metadata`` contains every ``key=value``
    pair exactly as written; fields without ``=`` are treated as positional.
    """
    meta: Dict[str, str] = {}
    positional: List[str] = []

    fields = comment.split(",") if comment else []
    for field in fields:
        key, sep, value = field.partition("=")
        if sep:
            meta[key] = value
        else:
            positional.append(field)

    # Keyed values win; positional fields are only a fallback.
    if "s" in meta:
        sample_id = meta["s"]
    else:
        sample_id = positional[0] if positional else None

    label = meta.get("proc")
    if not label:
        label = meta.get("label")
    if label is None and len(positional) >= 2:
        label = positional[1]

    direction = meta.get("dir")
    if not direction:
        direction = meta.get("d")

    return sample_id, label, direction, meta.get("dst"), meta
64
+
65
+
66
def _row(pkt: Packet) -> dict:
    """Flatten one packet into the dict that becomes a DataFrame row.

    The core columns come first, in ``COLUMNS`` order. Every comment key that
    is not already represented by a core column is then added under its own
    name.
    """
    sample_id, label, direction, dst, meta = parse_comment(pkt.comment)
    l3l4 = _decode.decode(pkt.data, pkt.link_type)

    row = {
        "timestamp": pkt.ts_nanos,
        "sample_id": sample_id,
        "label": label,
        "direction": direction,
        "dst": dst,
    }
    for field in ("src_ip", "dst_ip", "proto", "src_port", "dst_port"):
        row[field] = l3l4[field]
    row["length"] = pkt.orig_len
    row["raw"] = pkt.data

    for key, value in meta.items():
        if key not in _CORE_KEYS:
            # A key that would shadow an existing column is stored as
            # "meta_<key>" so the built-in value survives.
            row[("meta_" + key) if key in row else key] = value
    return row
91
+
92
+
93
def _finalize(df: pd.DataFrame) -> pd.DataFrame:
    """Apply consistent dtypes to a freshly built packet DataFrame.

    Core columns get fixed dtypes and a stable order; any extra columns
    (promoted from arbitrary comment keys) follow, typed as strings. An empty
    input is replaced by an empty frame holding just the core columns.

    The input frame is never modified; a new frame is returned.
    """
    if df.empty:
        df = pd.DataFrame({c: pd.Series(dtype="object") for c in COLUMNS})
    missing = {col: pd.NA for col in COLUMNS if col not in df.columns}
    extra_cols = [c for c in df.columns if c not in COLUMNS]
    # assign() returns a new frame and .copy() detaches the column selection,
    # so the assignments below neither touch the caller's frame nor trigger
    # pandas' chained-assignment warning.
    df = df.assign(**missing)[COLUMNS + extra_cols].copy()

    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ns", utc=True)
    for col in ("sample_id", "label", "direction", "dst", "src_ip", "dst_ip", "proto"):
        df[col] = df[col].astype("string")
    for col in ("src_port", "dst_port", "length"):
        df[col] = df[col].astype("Int64")
    for col in extra_cols:
        df[col] = df[col].astype("string")
    return df
115
+
116
+
117
def read_pcapml(source: Union[str], include_raw: bool = True) -> pd.DataFrame:
    """Read a pcapml-labeled pcapng file into a pandas DataFrame, one row per packet.

    Parameters
    ----------
    source:
        Path to a ``.pcapng`` file produced by pcapml. The value is handed
        unchanged to ``iter_packets``.
    include_raw:
        Keep the full packet bytes in the ``raw`` column (default ``True``).
        Set ``False`` to drop it and save memory; the bytes of each packet
        are then released as soon as its row has been built instead of being
        held until the whole file is loaded.
    """
    rows: List[dict] = []
    for pkt in iter_packets(source):
        row = _row(pkt)
        if not include_raw:
            # Keep the key so the column layout is unchanged, but do not hold
            # on to the packet bytes.
            row["raw"] = None
        rows.append(row)

    df = _finalize(pd.DataFrame(rows))
    if not include_raw:
        df = df.drop(columns=["raw"])
    return df
132
+
133
+
134
+ # Backwards/ergonomic alias.
135
+ read = read_pcapml
136
+
137
+
138
@dataclass
class Sample:
    """The packets of one sample together with the sample's identifying data."""

    # Sample ID, label and comment key/value pairs describing the sample.
    sample_id: Optional[str]
    label: Optional[str]
    metadata: Dict[str, str]
    # One row per packet belonging to the sample.
    df: pd.DataFrame

    def __len__(self) -> int:
        """Number of packet rows in the sample."""
        return len(self.df)

    # Short names kept for compatibility with the README's sampler() examples.
    @property
    def packets(self) -> pd.DataFrame:
        """Alias for ``df``."""
        return self.df

    @property
    def sid(self) -> Optional[str]:
        """Alias for ``sample_id``."""
        return self.sample_id
158
+
159
+
160
def samples(source: Union[str]) -> Iterator[Sample]:
    """Yield one :class:`Sample` per run of consecutive packets sharing a sample ID.

    Packets are consumed in file order and a new sample starts whenever the
    sample ID changes, so the input is expected to keep each sample's packets
    contiguous (pcapml writes them that way, and its ``sort`` subcommand
    restores that order). The label and metadata of a sample are taken from
    its first packet.
    """
    pending: List[dict] = []
    group_id: Optional[str] = None
    group_label: Optional[str] = None
    group_meta: Dict[str, str] = {}
    started = False

    for pkt in iter_packets(source):
        parsed = parse_comment(pkt.comment)
        sid, lbl, pkt_meta = parsed[0], parsed[1], parsed[4]

        boundary = started and sid != group_id
        if boundary:
            # The ID changed: emit the finished group before starting anew.
            yield Sample(group_id, group_label, group_meta, _finalize(pd.DataFrame(pending)))
            pending = []
        if boundary or not started:
            group_id, group_label, group_meta = sid, lbl, pkt_meta
            started = True

        pending.append(_row(pkt))

    if started:
        yield Sample(group_id, group_label, group_meta, _finalize(pd.DataFrame(pending)))
184
+
185
+
186
+ # README-compatible alias.
187
+ sampler = samples
@@ -0,0 +1,137 @@
1
+ Metadata-Version: 2.4
2
+ Name: pcapml
3
+ Version: 0.1.0
4
+ Summary: Read pcapml-labeled network traces (pcapng) into pandas DataFrames. Pure Python.
5
+ Author-email: Paul Schmitt <paul.schmitt@gmail.com>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/nprint/pcapml
8
+ Project-URL: Repository, https://github.com/nprint/pcapml
9
+ Keywords: pcap,pcapng,network,traffic,machine-learning,pandas,dataset
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Topic :: System :: Networking :: Monitoring
14
+ Classifier: Topic :: Scientific/Engineering
15
+ Classifier: Intended Audience :: Science/Research
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: pandas>=1.3
20
+ Provides-Extra: test
21
+ Requires-Dist: pytest>=7.0; extra == "test"
22
+ Dynamic: license-file
23
+
24
+ # pcapml (Python)
25
+
26
+ Read [pcapml](https://github.com/nprint/pcapml)-labeled network traces into
27
+ [pandas](https://pandas.pydata.org/) DataFrames. **Pure Python** — no native
28
+ extensions, no libpcap, no Go binary required. The only dependency is pandas.
29
+
30
+ pcapml stores ground-truth labels as per-packet comments inside standard
31
+ pcapng files. This package parses those files directly so you can go from a
32
+ labeled capture to a tidy DataFrame in one line.
33
+
34
+ ## Install
35
+
36
+ ```bash
37
+ pip install pcapml
38
+ ```
39
+
40
+ Or from this repository:
41
+
42
+ ```bash
43
+ pip install ./python
44
+ ```
45
+
46
+ ## Quick start
47
+
48
+ ```python
49
+ import pcapml
50
+
51
+ df = pcapml.read_pcapml("dataset.pcapng")
52
+ print(df.head())
53
+ ```
54
+
55
+ ```
56
+ timestamp sample_id label direction dst src_ip dst_ip proto src_port dst_port length
57
+ 2026-03-19 22:25:24.189764096+00:00 0 curl lan2wan example.com 192.168.1.188 104.18.26.120 TCP 47954 80 60
58
+ 2026-03-19 22:25:24.195076864+00:00 0 curl wan2lan example.com 104.18.26.120 192.168.1.188 TCP 80 47954 60
59
+ ...
60
+ ```
61
+
62
+ ### Columns
63
+
64
+ | Column | Dtype | Description |
65
+ |--------|-------|-------------|
66
+ | `timestamp` | `datetime64[ns, UTC]` | Packet capture time |
67
+ | `sample_id` | `string` | pcapml sample ID (kept as string — IDs can exceed 2⁶⁴) |
68
+ | `label` | `string` | Sample label / process name |
69
+ | `direction` | `string` | Direction tag if present (`lan2wan`, `wan2lan`, `e`, `i`, …) |
70
+ | `dst` | `string` | Resolved destination domain, if any |
71
+ | `src_ip`, `dst_ip` | `string` | L3 addresses (IPv4 or IPv6) |
72
+ | `proto` | `string` | `TCP`, `UDP`, `ICMP`, … (or the numeric value) |
73
+ | `src_port`, `dst_port` | `Int64` | L4 ports (nullable) |
74
+ | `length` | `Int64` | Original on-wire packet length |
75
+ | `raw` | `bytes` | Full captured packet bytes |
76
+
77
+ **Arbitrary comment keys** are promoted automatically. The columns above are the
78
+ fixed core schema; any other `key=value` pair in a comment becomes its own
79
+ column named after the key (sparse keys fill with `<NA>`). For example a comment
80
+ `s=0,proc=curl,dst=youtube.com,asn=15169` yields an extra `asn` column. If a key
81
+ collides with a core column name it is prefixed (`meta_<key>`) instead of
82
+ overwriting it.
83
+
84
+ Pass `include_raw=False` to drop the `raw` column and save memory:
85
+
86
+ ```python
87
+ df = pcapml.read_pcapml("dataset.pcapng", include_raw=False)
88
+ ```
89
+
90
+ The header decoding (IPv4/IPv6 addresses + TCP/UDP/SCTP ports) is intentionally lightweight. The
91
+ `raw` bytes are always available if you want a full dissector such as
92
+ [dpkt](https://github.com/kbandla/dpkt) or [scapy](https://scapy.net/):
93
+
94
+ ```python
95
+ import dpkt
96
+ df["eth"] = df["raw"].apply(dpkt.ethernet.Ethernet) # for Ethernet linktype
97
+ ```
98
+
99
+ ## Iterating by sample
100
+
101
+ For ML workflows it's often handier to work one sample at a time. `samples()`
102
+ groups consecutive packets by sample ID (pcapml writes each sample's packets
103
+ contiguously; use `pcapml sort` first if yours aren't):
104
+
105
+ ```python
106
+ for sample in pcapml.samples("dataset.pcapng"):
107
+ print(sample.sample_id, sample.label, len(sample))
108
+ sample.df # a DataFrame of just this sample's packets
109
+ sample.metadata # dict of every key=value pair from the comment
110
+ ```
111
+
112
+ `Sample` also exposes `.sid` and `.packets` aliases.
113
+
114
+ ## Label formats
115
+
116
+ Both pcapml comment encodings are supported transparently:
117
+
118
+ - **Keyed** (eBPF / gateway capture): `s=0,proc=curl,dir=lan2wan,dst=example.com`
119
+ - **Legacy positional**: `18205618432581910911,windows-10`
120
+
121
+ ## Lower-level access
122
+
123
+ If you don't want pandas in the loop, iterate raw packets directly:
124
+
125
+ ```python
126
+ from pcapml import iter_packets, parse_comment
127
+
128
+ for pkt in iter_packets("dataset.pcapng"):
129
+ sid, label, direction, dst, meta = parse_comment(pkt.comment)
130
+ pkt.timestamp # POSIX seconds (float)
131
+ pkt.data # raw bytes
132
+ pkt.link_type # pcapng linktype (1 = Ethernet, 101 = raw IP, i.e. IPv4 or IPv6)
133
+ ```
134
+
135
+ ## License
136
+
137
+ Apache-2.0.
@@ -0,0 +1,10 @@
1
+ pcapml/__init__.py,sha256=Sz_cR0ZqNZJf5ri-OoLj5hoEIcZaZDY0jOBhfziVrhc,1069
2
+ pcapml/_decode.py,sha256=i4W-BZ6quqWAz2S4UedAKO37TLPycY3DlLqFpQzHrkY,4789
3
+ pcapml/_pcapng.py,sha256=mmLVAuHV6e4Z2REUQn0EiQOsEGbbjWEFV6utQ3afZJw,6080
4
+ pcapml/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ pcapml/reader.py,sha256=oqOn0GR_-3aXJE-jqhkS1LhkfE2Ifi03Aks0oamXiZ8,5825
6
+ pcapml-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
7
+ pcapml-0.1.0.dist-info/METADATA,sha256=t9K7q-RxmeNO-SCKbvlH4iM5bzBLBz_k9e_MFzdpGPk,4799
8
+ pcapml-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
9
+ pcapml-0.1.0.dist-info/top_level.txt,sha256=V_WXwO6v1GJSMHkTAAXUEssN8gts_yfeeZfymQxkTq0,7
10
+ pcapml-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
@@ -0,0 +1 @@
1
+ pcapml