FlowAnalyzer 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- FlowAnalyzer/FlowAnalyzer.py +1 -3
- FlowAnalyzer/PacketParser.py +39 -41
- FlowAnalyzer/PcapSplitter.py +128 -0
- {flowanalyzer-0.4.4.dist-info → flowanalyzer-0.4.5.dist-info}/METADATA +1 -1
- flowanalyzer-0.4.5.dist-info/RECORD +12 -0
- flowanalyzer-0.4.4.dist-info/RECORD +0 -11
- {flowanalyzer-0.4.4.dist-info → flowanalyzer-0.4.5.dist-info}/WHEEL +0 -0
- {flowanalyzer-0.4.4.dist-info → flowanalyzer-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {flowanalyzer-0.4.4.dist-info → flowanalyzer-0.4.5.dist-info}/top_level.txt +0 -0
FlowAnalyzer/FlowAnalyzer.py
CHANGED
FlowAnalyzer/PacketParser.py
CHANGED
@@ -9,7 +9,7 @@ from .logging_config import logger
 
 class PacketParser:
     @staticmethod
-    def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes
+    def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes]:
         """
         Parse one line of Tshark output
         row definition (all bytes):
@@ -20,19 +20,18 @@ class PacketParser:
         4: tcp.payload
         5: frame.time_epoch
         6: exported_pdu.exported_pdu
-        7: http.request.full_uri
-        8: http.file_data
-        9: tcp.segment.count
+        7: http.request.full_uri
+        8: tcp.segment.count
         """
         frame_num = int(row[3])
         request_in = int(row[1]) if row[1] else frame_num
         # Decode only URI to string
         full_uri = parse.unquote(row[7].decode("utf-8", errors="replace")) if row[7] else ""
         time_epoch = float(row[5])
-        http_file_data = row[8] if len(row) > 8 else b""
 
         # Logic for Raw Packet (Header Source)
-
+        # Previous index 9 is now 8 since we removed http.file_data
+        is_reassembled = len(row) > 8 and row[8]
 
         if is_reassembled and row[2]:
             full_request = row[2]
@@ -42,7 +41,7 @@ class PacketParser:
         # Fallback (e.g. Exported PDU)
         full_request = row[2] if row[2] else (row[6] if row[6] else b"")
 
-        return frame_num, request_in, time_epoch, full_uri, full_request
+        return frame_num, request_in, time_epoch, full_uri, full_request
 
     @staticmethod
     def split_http_headers(file_data: bytes) -> Tuple[bytes, bytes]:
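The changes above drop the old http.file_data column, so tcp.segment.count moves from index 9 to index 8. A hypothetical row illustrating the new 9-column layout (values are invented; only the indices the parser is shown to read are meaningful, and the hex encoding of the payload column is an assumption consistent with the unhexlify call later in this file):

from FlowAnalyzer.PacketParser import PacketParser

# Hypothetical tshark field row (all entries are bytes, tab-split upstream).
row = [
    b"200",           # 0: http.response.code (non-empty marks a response)
    b"",              # 1: request-in frame; empty, so request_in falls back to frame.number
    b"48656c6c6f",    # 2: payload used as full_request (assumed hex-encoded text)
    b"42",            # 3: frame.number
    b"",              # 4: tcp.payload
    b"1700000000.0",  # 5: frame.time_epoch
    b"",              # 6: exported_pdu.exported_pdu
    b"",              # 7: http.request.full_uri
    b"",              # 8: tcp.segment.count (index 9 before this release)
]
frame_num, request_in, time_epoch, full_uri, full_request = PacketParser.parse_packet_data(row)
# frame_num == 42, request_in == 42 (fallback to frame.number), full_uri == ""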
@@ -67,6 +66,10 @@ class PacketParser:
         while cursor < total_len:
             newline_idx = file_data.find(b"\n", cursor)
             if newline_idx == -1:
+                # If no newline found, maybe it's just remaining data (though strictly should end with 0 chunk)
+                # But for robustness we might perform a "best effort" or just stop.
+                # raising ValueError("Not chunked data") might be too aggressive if we are just "trying" to dechunk
+                # Let's assume non-chunked if strict format not found
                 raise ValueError("Not chunked data")
 
             size_line = file_data[cursor:newline_idx].strip()
@@ -74,13 +77,21 @@ class PacketParser:
                 cursor = newline_idx + 1
                 continue
 
-
+            try:
+                chunk_size = int(size_line, 16)
+            except ValueError:
+                raise ValueError("Invalid chunk size")
+
             if chunk_size == 0:
                 break
 
             data_start = newline_idx + 1
             data_end = data_start + chunk_size
 
+            # Robustness check
+            if data_start > total_len:
+                break
+
             if data_end > total_len:
                 chunks.append(file_data[data_start:])
                 break
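For orientation, a standalone sketch of the chunked-transfer decoding these hunks are hardening. This is a simplified stand-in, not the package's dechunk_http_response itself, but it follows the same steps (hex chunk size, zero-length terminator, CRLF skipping):

def dechunk(body: bytes) -> bytes:
    # Simplified illustration of HTTP chunked-transfer decoding.
    chunks, cursor, total = [], 0, len(body)
    while cursor < total:
        nl = body.find(b"\n", cursor)
        if nl == -1:
            raise ValueError("Not chunked data")
        size_line = body[cursor:nl].strip()
        if not size_line:
            cursor = nl + 1
            continue
        try:
            chunk_size = int(size_line, 16)   # chunk sizes are hexadecimal
        except ValueError:
            raise ValueError("Invalid chunk size")
        if chunk_size == 0:                   # zero-length chunk terminates the body
            break
        start, end = nl + 1, nl + 1 + chunk_size
        chunks.append(body[start:min(end, total)])
        cursor = end
        while cursor < total and body[cursor] in (13, 10):  # skip the CRLF after each chunk
            cursor += 1
    return b"".join(chunks)

assert dechunk(b"4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n") == b"Wikipedia"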
@@ -88,51 +99,38 @@ class PacketParser:
             chunks.append(file_data[data_start:data_end])
 
             cursor = data_end
+            # Skip CRLF after chunk data
             while cursor < total_len and file_data[cursor] in (13, 10):
                 cursor += 1
 
         return b"".join(chunks)
 
     @staticmethod
-    def extract_http_file_data(full_request: bytes
+    def extract_http_file_data(full_request: bytes) -> Tuple[bytes, bytes]:
         """
         Extract the file data from an HTTP request or response (hybrid mode, binary-optimized)
         """
         header = b""
         file_data = b""
 
+        if not full_request:
+            return b"", b""
         try:
-
-
-
-
-
-
-
-
-
-
-
-        except binascii.Error:
-            logger.warning("Failed to parse http.file_data hex, falling back to the raw method")
-
-            # --- 3. Fallback mode ---
-            if full_request and not file_data:
-                raw_bytes = binascii.unhexlify(full_request)
-                _, body_part = PacketParser.split_http_headers(raw_bytes)
-
-                with contextlib.suppress(Exception):
-                    body_part = PacketParser.dechunk_http_response(body_part)
-
-                with contextlib.suppress(Exception):
-                    if body_part.startswith(b"\x1f\x8b"):
-                        body_part = gzip.decompress(body_part)
-
-                file_data = body_part
+            raw_bytes = binascii.unhexlify(full_request)
+            header, body_part = PacketParser.split_http_headers(raw_bytes)
+
+            with contextlib.suppress(Exception):
+                body_part = PacketParser.dechunk_http_response(body_part)
+
+            with contextlib.suppress(Exception):
+                if body_part.startswith(b"\x1f\x8b"):
+                    body_part = gzip.decompress(body_part)
+
+            file_data = body_part
             return header, file_data
 
-        except
-            logger.error(
+        except binascii.Error:
+            logger.error("Hex conversion failed")
             return b"", b""
         except Exception as e:
             logger.error(f"Unknown error while parsing HTTP data: {e}")
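The rewritten extract_http_file_data now always unhexlifies the payload, splits headers from body, then best-effort dechunks and gunzips. A rough end-to-end sketch with a hand-built response; the header split here is simplified to a single partition rather than the package's split_http_headers, and the sample payload is hypothetical:

import binascii
import gzip

# Hypothetical hex-encoded HTTP response of the kind tshark hands to the parser.
raw = b"HTTP/1.1 200 OK\r\nContent-Encoding: gzip\r\n\r\n" + gzip.compress(b"hello world")
hex_payload = binascii.hexlify(raw)

raw_bytes = binascii.unhexlify(hex_payload)             # 1. hex text -> raw bytes
header, _, body = raw_bytes.partition(b"\r\n\r\n")      # 2. split headers from body (simplified)
if body.startswith(b"\x1f\x8b"):                        # 3. gunzip when the gzip magic is present
    body = gzip.decompress(body)
print(header.decode())                                  # the response headers
print(body)                                             # b'hello world'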
@@ -149,12 +147,12 @@ class PacketParser:
 
         row = line.split(b"\t")
         try:
-            frame_num, request_in, time_epoch, full_uri, full_request
+            frame_num, request_in, time_epoch, full_uri, full_request = PacketParser.parse_packet_data(row)
 
-            if not full_request
+            if not full_request:
                 return None
 
-            header, file_data = PacketParser.extract_http_file_data(full_request
+            header, file_data = PacketParser.extract_http_file_data(full_request)
 
             # row[0] is http.response.code (bytes)
             is_response = bool(row[0])
FlowAnalyzer/PcapSplitter.py
ADDED
@@ -0,0 +1,128 @@
+import os
+import time
+from collections import defaultdict
+from typing import List, Tuple
+
+import dpkt
+
+
+class PcapSplitter:
+    """
+    Encapsulates logic to split a PCAP/PCAPNG file into multiple smaller PCAP files
+    based on TCP flows, dynamically balanced for parallel processing.
+    """
+
+    def __init__(self, pcap_file: str, output_dir: str):
+        self.pcap_file = pcap_file
+        self.output_dir = output_dir
+
+    def get_stream_key(self, tcp, ip) -> Tuple:
+        """Generate a 5-tuple key for the flow."""
+        src = ip.src
+        dst = ip.dst
+        sport = tcp.sport
+        dport = tcp.dport
+        # Canonicalize bidirectional flows to the same key
+        key1 = (src, dst, sport, dport)
+        key2 = (dst, src, dport, sport)
+        return key1 if key1 < key2 else key2
+
+    def split(self, threshold_mb: int = 10, default_chunks: int = 3) -> List[str]:
+        """
+        Split the pcap file into balanced chunks based on stream volume (bytes).
+        Uses a Greedy Partition Algorithm (Longest Processing Time first).
+
+        Args:
+            threshold_mb: File size threshold in MB. If smaller, do not split.
+            default_chunks: Number of chunks to split into if threshold is exceeded.
+
+        Returns:
+            List of generated file paths (or original file if not split).
+        """
+        if not os.path.exists(self.pcap_file):
+            raise FileNotFoundError(f"PCAP file not found: {self.pcap_file}")
+
+        file_size_mb = os.path.getsize(self.pcap_file) / (1024 * 1024)
+        if file_size_mb < threshold_mb:
+            print(f"File size {file_size_mb:.2f}MB < {threshold_mb}MB. Skipping split.")
+            return [self.pcap_file]
+
+        os.makedirs(self.output_dir, exist_ok=True)
+
+        start_time = time.time()
+        # Dictionary to store packets: stream_key -> list of (ts, buf)
+        streams = defaultdict(list)
+        # Dictionary to store total size: stream_key -> total_bytes
+        stream_sizes = defaultdict(int)
+
+        # 1. Read and Group Packets
+        print(f"Reading {self.pcap_file}...")
+        with open(self.pcap_file, "rb") as f:
+            if self.pcap_file.lower().endswith(".pcapng"):
+                reader = dpkt.pcapng.Reader(f)
+            else:
+                reader = dpkt.pcap.Reader(f)
+
+            for ts, buf in reader:
+                try:
+                    eth = dpkt.ethernet.Ethernet(buf)
+                    if not isinstance(eth.data, dpkt.ip.IP):
+                        continue
+                    ip = eth.data
+                    if not isinstance(ip.data, dpkt.tcp.TCP):
+                        continue
+                    tcp = ip.data
+
+                    key = self.get_stream_key(tcp, ip)
+                    streams[key].append((ts, buf))
+                    stream_sizes[key] += len(buf)
+                except Exception:
+                    continue
+
+        total_streams = len(streams)
+        print(f"Found {total_streams} TCP streams.")
+
+        if total_streams == 0:
+            print("No TCP streams found to split.")
+            return []
+
+        # 2. Assign Streams to Buckets (Greedy LPT Algorithm)
+        num_chunks = min(default_chunks, total_streams)
+
+        # Sort streams by size (descending)
+        sorted_streams = sorted(stream_sizes.items(), key=lambda item: item[1], reverse=True)
+
+        # Buckets: list of (current_size, batch_index, list_of_keys)
+        # We perform standard list sort to find min bucket, sufficient for small N
+        buckets = [[0, i, []] for i in range(num_chunks)]
+
+        for key, size in sorted_streams:
+            # Find bucket with smallest current size
+            buckets.sort(key=lambda x: x[0])
+            smallest_bucket = buckets[0]
+
+            # Add stream to this bucket
+            smallest_bucket[0] += size
+            smallest_bucket[2].append(key)
+
+        print(f"Splitting into {num_chunks} files with volume balancing...")
+        generated_files = []
+
+        # 3. Write Batches
+        # Sort buckets by index ensures file naming order 0, 1, 2...
+        buckets.sort(key=lambda x: x[1])
+
+        for size, i, batch_keys in buckets:
+            out_file_path = os.path.join(self.output_dir, f"batch_{i}.pcap")
+            generated_files.append(out_file_path)
+
+            with open(out_file_path, "wb") as f:
+                writer = dpkt.pcap.Writer(f)
+                for key in batch_keys:
+                    for ts, buf in streams[key]:
+                        writer.writepkt(buf, ts)
+
+            print(f" - Created {os.path.basename(out_file_path)}: {len(batch_keys)} streams ({size/1024/1024:.2f} MB)")
+
+        print(f"Split completed in {time.time() - start_time:.2f}s")
+        return generated_files
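The new class is self-contained apart from the dpkt dependency. A minimal usage sketch (paths are hypothetical):

from FlowAnalyzer.PcapSplitter import PcapSplitter

# split() returns [pcap_file] unchanged when the capture is under threshold_mb,
# otherwise it writes batch_<i>.pcap files into output_dir and returns their paths.
splitter = PcapSplitter("capture.pcapng", "split_output")
for path in splitter.split(threshold_mb=10, default_chunks=4):
    print(path)

The greedy LPT balancing can also be checked in isolation; with stream volumes [90, 60, 50, 40] and two buckets it yields totals of 130 (90+40) and 110 (60+50):

sizes = [90, 60, 50, 40]                      # toy stream volumes, already sorted descending
buckets = [[0, i, []] for i in range(2)]      # [current_size, batch_index, assigned_sizes]
for s in sizes:
    buckets.sort(key=lambda b: b[0])          # always fill the currently smallest bucket
    buckets[0][0] += s
    buckets[0][2].append(s)
print(sorted(b[0] for b in buckets))          # [110, 130]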
flowanalyzer-0.4.5.dist-info/RECORD
ADDED
@@ -0,0 +1,12 @@
+FlowAnalyzer/FlowAnalyzer.py,sha256=9SshWk5wf0XATI7W4eBiIpzEqeGFyQJs3on5ox-zrNQ,12666
+FlowAnalyzer/Models.py,sha256=2x7nPJIAyLTC1oiGFlW4mELDPgthk2IsmuyearT-MSQ,622
+FlowAnalyzer/PacketParser.py,sha256=vdXUMFteSlIbOJ4y4_ikUIL3HwBCFBBgjevNL0jLozE,6174
+FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
+FlowAnalyzer/PcapSplitter.py,sha256=0E_vmLYYsE_gD34XTwG1XPx5kBg8ZchJspQEnkBoIdY,4855
+FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
+FlowAnalyzer/logging_config.py,sha256=fnBlvoimteQ38IBlQBV9fdLQvfAlRgGhcvLpUC3YunA,732
+flowanalyzer-0.4.5.dist-info/licenses/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
+flowanalyzer-0.4.5.dist-info/METADATA,sha256=oyoNqX8eZkkiNTrkz8qrZ6T7ofrW0lnFSxXoV2Q1wIU,6099
+flowanalyzer-0.4.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+flowanalyzer-0.4.5.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
+flowanalyzer-0.4.5.dist-info/RECORD,,
flowanalyzer-0.4.4.dist-info/RECORD
REMOVED
@@ -1,11 +0,0 @@
-FlowAnalyzer/FlowAnalyzer.py,sha256=GPXZeM1uiLmv_-UKtIwYlfYJ450Etpbtt4V2i_MpLhQ,12721
-FlowAnalyzer/Models.py,sha256=2x7nPJIAyLTC1oiGFlW4mELDPgthk2IsmuyearT-MSQ,622
-FlowAnalyzer/PacketParser.py,sha256=fGql84e-tu1PDsXh3NxctKaSh5YeYsJbh5ZCUe6Mo40,6329
-FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
-FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
-FlowAnalyzer/logging_config.py,sha256=fnBlvoimteQ38IBlQBV9fdLQvfAlRgGhcvLpUC3YunA,732
-flowanalyzer-0.4.4.dist-info/licenses/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
-flowanalyzer-0.4.4.dist-info/METADATA,sha256=aUWcp8_ocQIgz0k_3IlhEHrkzsLHYo7XBPqEayIOGc0,6099
-flowanalyzer-0.4.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-flowanalyzer-0.4.4.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
-flowanalyzer-0.4.4.dist-info/RECORD,,
{flowanalyzer-0.4.4.dist-info → flowanalyzer-0.4.5.dist-info}/WHEEL
File without changes
{flowanalyzer-0.4.4.dist-info → flowanalyzer-0.4.5.dist-info}/licenses/LICENSE
File without changes
{flowanalyzer-0.4.4.dist-info → flowanalyzer-0.4.5.dist-info}/top_level.txt
File without changes