FlowAnalyzer 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -199,9 +199,7 @@ class FlowAnalyzer:
             "-e",
             "http.request.full_uri", # 7
             "-e",
-            "http.file_data", # 8
-            "-e",
-            "tcp.segment.count", # 9
+            "tcp.segment.count", # 8
             "-E",
             "header=n",
             "-E",
@@ -9,7 +9,7 @@ from .logging_config import logger
 
 class PacketParser:
     @staticmethod
-    def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes, bytes]:
+    def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes]:
         """
         Parse one line of Tshark output
         row definition (all bytes):
@@ -21,18 +21,17 @@ class PacketParser:
         5: frame.time_epoch
         6: exported_pdu.exported_pdu
         7: http.request.full_uri
-        8: http.file_data
-        9: tcp.segment.count
+        8: tcp.segment.count
         """
         frame_num = int(row[3])
         request_in = int(row[1]) if row[1] else frame_num
         # Decode only URI to string
         full_uri = parse.unquote(row[7].decode("utf-8", errors="replace")) if row[7] else ""
         time_epoch = float(row[5])
-        http_file_data = row[8] if len(row) > 8 else b""
 
         # Logic for Raw Packet (Header Source)
-        is_reassembled = len(row) > 9 and row[9]
+        # Previous index 9 is now 8 since we removed http.file_data
+        is_reassembled = len(row) > 8 and row[8]
 
         if is_reassembled and row[2]:
             full_request = row[2]
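To make the index shift concrete, here is a hypothetical tab-split row such as the parser above might receive in 0.4.6. Only indices 0, 3, 5, 6, 7 and 8 are taken from the diff; the remaining values are placeholders, not fields documented here:

    # Hypothetical example row (all placeholder values, for illustration only).
    row = [
        b"200",               # 0: http.response.code
        b"",                  # 1: placeholder (request frame reference)
        b"",                  # 2: placeholder (hex-encoded raw packet data)
        b"42",                # 3: frame.number
        b"",                  # 4: placeholder
        b"1700000000.123",    # 5: frame.time_epoch
        b"",                  # 6: exported_pdu.exported_pdu
        b"http%3A%2F%2Fexample.com%2F",  # 7: http.request.full_uri
        b"3",                 # 8: tcp.segment.count (was index 9 before 0.4.6)
    ]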
@@ -42,16 +41,18 @@ class PacketParser:
         # Fallback (e.g. Exported PDU)
         full_request = row[2] if row[2] else (row[6] if row[6] else b"")
 
-        return frame_num, request_in, time_epoch, full_uri, full_request, http_file_data
+        return frame_num, request_in, time_epoch, full_uri, full_request
 
     @staticmethod
     def split_http_headers(file_data: bytes) -> Tuple[bytes, bytes]:
         headerEnd = file_data.find(b"\r\n\r\n")
         if headerEnd != -1:
             return file_data[: headerEnd + 4], file_data[headerEnd + 4 :]
-        elif file_data.find(b"\n\n") != -1:
-            headerEnd = file_data.index(b"\n\n") + 2
-            return file_data[:headerEnd], file_data[headerEnd:]
+
+        headerEnd = file_data.find(b"\n\n")
+        if headerEnd != -1:
+            return file_data[: headerEnd + 2], file_data[headerEnd + 2 :]
+
         return b"", file_data
 
     @staticmethod
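As a standalone illustration (not part of the diff), the rewritten split_http_headers applies the same slicing logic to a bare LF-LF header terminator as to CRLF-CRLF; the sample payloads below are made up:

    crlf_msg = b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nok"
    lf_msg = b"HTTP/1.1 200 OK\nContent-Length: 2\n\nok"

    # Both return (header bytes including the blank line, body bytes).
    print(PacketParser.split_http_headers(crlf_msg))  # (..., b"ok")
    print(PacketParser.split_http_headers(lf_msg))    # (..., b"ok")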
@@ -67,20 +68,36 @@ class PacketParser:
         while cursor < total_len:
             newline_idx = file_data.find(b"\n", cursor)
             if newline_idx == -1:
+                # If no newline is found, the data does not match the strict
+                # chunked format (which must end with a 0-size chunk).
+                # The caller only "tries" to dechunk and suppresses this error,
+                # so raising here simply marks the body as not chunked.
                 raise ValueError("Not chunked data")
 
             size_line = file_data[cursor:newline_idx].strip()
+            # Handle chunk extension: ignore everything after ';'
+            if b";" in size_line:
+                size_line = size_line.split(b";", 1)[0].strip()
+
             if not size_line:
                 cursor = newline_idx + 1
                 continue
 
-            chunk_size = int(size_line, 16)
+            try:
+                chunk_size = int(size_line, 16)
+            except ValueError:
+                raise ValueError("Invalid chunk size")
+
             if chunk_size == 0:
                 break
 
             data_start = newline_idx + 1
             data_end = data_start + chunk_size
 
+            # Robustness check
+            if data_start > total_len:
+                break
+
             if data_end > total_len:
                 chunks.append(file_data[data_start:])
                 break
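For reference, a made-up chunked body that exercises the two additions above (chunk-extension stripping and the explicit size check); this example is not part of the package:

    # "4;name=val" carries a chunk extension, which 0.4.6 now ignores.
    body = b"4;name=val\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n"
    print(PacketParser.dechunk_http_response(body))  # b"Wikipedia"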
@@ -88,51 +105,38 @@ class PacketParser:
             chunks.append(file_data[data_start:data_end])
 
             cursor = data_end
+            # Skip CRLF after chunk data
             while cursor < total_len and file_data[cursor] in (13, 10):
                 cursor += 1
 
         return b"".join(chunks)
 
     @staticmethod
-    def extract_http_file_data(full_request: bytes, http_file_data: bytes) -> Tuple[bytes, bytes]:
+    def extract_http_file_data(full_request: bytes) -> Tuple[bytes, bytes]:
         """
         Extract the file data from an HTTP request or response (hybrid mode, binary-optimized)
         """
         header = b""
         file_data = b""
 
+        if not full_request:
+            return b"", b""
         try:
-            # --- 1. Extract header ---
-            if full_request:
-                raw_bytes = binascii.unhexlify(full_request)
-                h_part, _ = PacketParser.split_http_headers(raw_bytes)
-                header = h_part
-
-            # --- 2. Extract body ---
-            if http_file_data:
-                try:
-                    file_data = binascii.unhexlify(http_file_data)
-                    return header, file_data
-                except binascii.Error:
-                    logger.warning("Failed to parse http.file_data hex, falling back to the raw data")
-
-            # --- 3. Fallback mode ---
-            if full_request and not file_data:
-                raw_bytes = binascii.unhexlify(full_request)
-                _, body_part = PacketParser.split_http_headers(raw_bytes)
-
-                with contextlib.suppress(Exception):
-                    body_part = PacketParser.dechunk_http_response(body_part)
-
-                with contextlib.suppress(Exception):
-                    if body_part.startswith(b"\x1f\x8b"):
-                        body_part = gzip.decompress(body_part)
-
-                file_data = body_part
+            raw_bytes = binascii.unhexlify(full_request)
+            header, body_part = PacketParser.split_http_headers(raw_bytes)
+
+            with contextlib.suppress(Exception):
+                body_part = PacketParser.dechunk_http_response(body_part)
+
+            with contextlib.suppress(Exception):
+                if body_part.startswith(b"\x1f\x8b"):
+                    body_part = gzip.decompress(body_part)
+
+            file_data = body_part
             return header, file_data
 
-        except ValueError as e:
-            logger.error(f"Hex conversion failed: {str(e)[:100]}...")
+        except binascii.Error:
+            logger.error("Hex conversion failed")
             return b"", b""
         except Exception as e:
             logger.error(f"Unknown error while parsing HTTP data: {e}")
@@ -149,12 +153,12 @@ class PacketParser:
 
         row = line.split(b"\t")
         try:
-            frame_num, request_in, time_epoch, full_uri, full_request, http_file_data = PacketParser.parse_packet_data(row)
+            frame_num, request_in, time_epoch, full_uri, full_request = PacketParser.parse_packet_data(row)
 
-            if not full_request and not http_file_data:
+            if not full_request:
                 return None
 
-            header, file_data = PacketParser.extract_http_file_data(full_request, http_file_data)
+            header, file_data = PacketParser.extract_http_file_data(full_request)
 
             # row[0] is http.response.code (bytes)
             is_response = bool(row[0])
@@ -0,0 +1,128 @@
+import os
+import time
+from collections import defaultdict
+from typing import List, Tuple
+
+import dpkt
+
+
+class PcapSplitter:
+    """
+    Encapsulates logic to split a PCAP/PCAPNG file into multiple smaller PCAP files
+    based on TCP flows, dynamically balanced for parallel processing.
+    """
+
+    def __init__(self, pcap_file: str, output_dir: str):
+        self.pcap_file = pcap_file
+        self.output_dir = output_dir
+
+    def get_stream_key(self, tcp, ip) -> Tuple:
+        """Generate a 5-tuple key for the flow."""
+        src = ip.src
+        dst = ip.dst
+        sport = tcp.sport
+        dport = tcp.dport
+        # Canonicalize bidirectional flows to the same key
+        key1 = (src, dst, sport, dport)
+        key2 = (dst, src, dport, sport)
+        return key1 if key1 < key2 else key2
+
+    def split(self, threshold_mb: int = 10, default_chunks: int = 3) -> List[str]:
+        """
+        Split the pcap file into balanced chunks based on stream volume (bytes).
+        Uses a Greedy Partition Algorithm (Longest Processing Time first).
+
+        Args:
+            threshold_mb: File size threshold in MB. If smaller, do not split.
+            default_chunks: Number of chunks to split into if threshold is exceeded.
+
+        Returns:
+            List of generated file paths (or original file if not split).
+        """
+        if not os.path.exists(self.pcap_file):
+            raise FileNotFoundError(f"PCAP file not found: {self.pcap_file}")
+
+        file_size_mb = os.path.getsize(self.pcap_file) / (1024 * 1024)
+        if file_size_mb < threshold_mb:
+            print(f"File size {file_size_mb:.2f}MB < {threshold_mb}MB. Skipping split.")
+            return [self.pcap_file]
+
+        os.makedirs(self.output_dir, exist_ok=True)
+
+        start_time = time.time()
+        # Dictionary to store packets: stream_key -> list of (ts, buf)
+        streams = defaultdict(list)
+        # Dictionary to store total size: stream_key -> total_bytes
+        stream_sizes = defaultdict(int)
+
+        # 1. Read and Group Packets
+        print(f"Reading {self.pcap_file}...")
+        with open(self.pcap_file, "rb") as f:
+            if self.pcap_file.lower().endswith(".pcapng"):
+                reader = dpkt.pcapng.Reader(f)
+            else:
+                reader = dpkt.pcap.Reader(f)
+
+            for ts, buf in reader:
+                try:
+                    eth = dpkt.ethernet.Ethernet(buf)
+                    if not isinstance(eth.data, dpkt.ip.IP):
+                        continue
+                    ip = eth.data
+                    if not isinstance(ip.data, dpkt.tcp.TCP):
+                        continue
+                    tcp = ip.data
+
+                    key = self.get_stream_key(tcp, ip)
+                    streams[key].append((ts, buf))
+                    stream_sizes[key] += len(buf)
+                except Exception:
+                    continue
+
+        total_streams = len(streams)
+        print(f"Found {total_streams} TCP streams.")
+
+        if total_streams == 0:
+            print("No TCP streams found to split.")
+            return []
+
+        # 2. Assign Streams to Buckets (Greedy LPT Algorithm)
+        num_chunks = min(default_chunks, total_streams)
+
+        # Sort streams by size (descending)
+        sorted_streams = sorted(stream_sizes.items(), key=lambda item: item[1], reverse=True)
+
+        # Buckets: list of (current_size, batch_index, list_of_keys)
+        # A plain list sort to find the smallest bucket is sufficient for small N
+        buckets = [[0, i, []] for i in range(num_chunks)]
+
+        for key, size in sorted_streams:
+            # Find bucket with smallest current size
+            buckets.sort(key=lambda x: x[0])
+            smallest_bucket = buckets[0]
+
+            # Add stream to this bucket
+            smallest_bucket[0] += size
+            smallest_bucket[2].append(key)
+
+        print(f"Splitting into {num_chunks} files with volume balancing...")
+        generated_files = []
+
+        # 3. Write Batches
+        # Sorting buckets by index keeps the file naming order 0, 1, 2...
+        buckets.sort(key=lambda x: x[1])
+
+        for size, i, batch_keys in buckets:
+            out_file_path = os.path.join(self.output_dir, f"batch_{i}.pcap")
+            generated_files.append(out_file_path)
+
+            with open(out_file_path, "wb") as f:
+                writer = dpkt.pcap.Writer(f)
+                for key in batch_keys:
+                    for ts, buf in streams[key]:
+                        writer.writepkt(buf, ts)
+
+            print(f" - Created {os.path.basename(out_file_path)}: {len(batch_keys)} streams ({size/1024/1024:.2f} MB)")
+
+        print(f"Split completed in {time.time() - start_time:.2f}s")
+        return generated_files
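Assuming the module is importable as FlowAnalyzer.PcapSplitter (as the RECORD listing below suggests), a usage sketch with example paths:

    from FlowAnalyzer.PcapSplitter import PcapSplitter

    # Files smaller than threshold_mb are returned untouched; larger captures
    # are split into batch_0.pcap ... batch_N.pcap under output_dir.
    splitter = PcapSplitter("capture.pcapng", output_dir="batches")
    parts = splitter.split(threshold_mb=10, default_chunks=3)
    print(parts)

Sorting streams by size and always placing the next stream into the lightest bucket is the classic LPT heuristic, which keeps the batches roughly balanced without solving the partition problem exactly.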
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: FlowAnalyzer
-Version: 0.4.4
+Version: 0.4.6
 Summary: FlowAnalyzer is a traffic analyzer for parsing and processing JSON data files exported by tshark
 Home-page: https://github.com/Byxs20/FlowAnalyzer
 Author: Byxs20
@@ -0,0 +1,12 @@
+FlowAnalyzer/FlowAnalyzer.py,sha256=9SshWk5wf0XATI7W4eBiIpzEqeGFyQJs3on5ox-zrNQ,12666
+FlowAnalyzer/Models.py,sha256=2x7nPJIAyLTC1oiGFlW4mELDPgthk2IsmuyearT-MSQ,622
+FlowAnalyzer/PacketParser.py,sha256=So3iD2ykkWpT0e3aLjBdx_ohoNscD-oAt4bfr_oRqgo,6331
+FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
+FlowAnalyzer/PcapSplitter.py,sha256=0E_vmLYYsE_gD34XTwG1XPx5kBg8ZchJspQEnkBoIdY,4855
+FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
+FlowAnalyzer/logging_config.py,sha256=fnBlvoimteQ38IBlQBV9fdLQvfAlRgGhcvLpUC3YunA,732
+flowanalyzer-0.4.6.dist-info/licenses/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
+flowanalyzer-0.4.6.dist-info/METADATA,sha256=j9Bw-2Sr1dx_DatRtxo56WE0BB1-WMOoIhfoSoSYk-Y,6099
+flowanalyzer-0.4.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+flowanalyzer-0.4.6.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
+flowanalyzer-0.4.6.dist-info/RECORD,,
@@ -1,11 +0,0 @@
-FlowAnalyzer/FlowAnalyzer.py,sha256=GPXZeM1uiLmv_-UKtIwYlfYJ450Etpbtt4V2i_MpLhQ,12721
-FlowAnalyzer/Models.py,sha256=2x7nPJIAyLTC1oiGFlW4mELDPgthk2IsmuyearT-MSQ,622
-FlowAnalyzer/PacketParser.py,sha256=fGql84e-tu1PDsXh3NxctKaSh5YeYsJbh5ZCUe6Mo40,6329
-FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
-FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
-FlowAnalyzer/logging_config.py,sha256=fnBlvoimteQ38IBlQBV9fdLQvfAlRgGhcvLpUC3YunA,732
-flowanalyzer-0.4.4.dist-info/licenses/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
-flowanalyzer-0.4.4.dist-info/METADATA,sha256=aUWcp8_ocQIgz0k_3IlhEHrkzsLHYo7XBPqEayIOGc0,6099
-flowanalyzer-0.4.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-flowanalyzer-0.4.4.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
-flowanalyzer-0.4.4.dist-info/RECORD,,