FlowAnalyzer 0.4.4.tar.gz → 0.4.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -199,9 +199,7 @@ class FlowAnalyzer:
  "-e",
  "http.request.full_uri", # 7
  "-e",
- "http.file_data", # 8
- "-e",
- "tcp.segment.count", # 9
+ "tcp.segment.count", # 8
  "-E",
  "header=n",
  "-E",
@@ -9,7 +9,7 @@ from .logging_config import logger

  class PacketParser:
  @staticmethod
- def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes, bytes]:
+ def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes]:
  """
  Parse one line of Tshark output
  row definition (all bytes):
@@ -20,19 +20,18 @@ class PacketParser:
  4: tcp.payload
  5: frame.time_epoch
  6: exported_pdu.exported_pdu
- 7: http.request.full_uri
- 8: http.file_data
- 9: tcp.segment.count
+ 7: http.request.full_uri
+ 8: tcp.segment.count
  """
  frame_num = int(row[3])
  request_in = int(row[1]) if row[1] else frame_num
  # Decode only URI to string
  full_uri = parse.unquote(row[7].decode("utf-8", errors="replace")) if row[7] else ""
  time_epoch = float(row[5])
- http_file_data = row[8] if len(row) > 8 else b""

  # Logic for Raw Packet (Header Source)
- is_reassembled = len(row) > 9 and row[9]
+ # Previous index 9 is now 8 since we removed http.file_data
+ is_reassembled = len(row) > 8 and row[8]

  if is_reassembled and row[2]:
  full_request = row[2]
@@ -42,7 +41,7 @@ class PacketParser:
  # Fallback (e.g. Exported PDU)
  full_request = row[2] if row[2] else (row[6] if row[6] else b"")

- return frame_num, request_in, time_epoch, full_uri, full_request, http_file_data
+ return frame_num, request_in, time_epoch, full_uri, full_request

  @staticmethod
  def split_http_headers(file_data: bytes) -> Tuple[bytes, bytes]:
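
For orientation, a minimal sketch of the new five-value contract of parse_packet_data. The row below is entirely made up; it only follows the indices documented in the diff (row[3] = frame.number, row[5] = frame.time_epoch, row[7] = http.request.full_uri, row[8] = tcp.segment.count):

from FlowAnalyzer.PacketParser import PacketParser

# Hypothetical row, already split from tshark's tab-separated output.
row = [b"", b"", b"deadbeef", b"42", b"", b"1700000000.0", b"", b"http%3A//example.com/", b"2"]

frame_num, request_in, time_epoch, full_uri, full_request = PacketParser.parse_packet_data(row)
# frame_num == 42, request_in == 42 (falls back to frame_num because row[1] is empty),
# full_uri == "http://example.com/",
# full_request == b"deadbeef" (still hex-encoded, as expected by extract_http_file_data)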
@@ -67,6 +66,10 @@ class PacketParser:
  while cursor < total_len:
  newline_idx = file_data.find(b"\n", cursor)
  if newline_idx == -1:
+ # No newline found: the data does not follow the strict chunked format,
+ # which must terminate with a zero-size chunk. Raising here is acceptable
+ # because callers that merely attempt dechunking suppress the error and
+ # fall back to the raw body.
  raise ValueError("Not chunked data")

  size_line = file_data[cursor:newline_idx].strip()
@@ -74,13 +77,21 @@ class PacketParser:
  cursor = newline_idx + 1
  continue

- chunk_size = int(size_line, 16)
+ try:
+ chunk_size = int(size_line, 16)
+ except ValueError:
+ raise ValueError("Invalid chunk size")
+
  if chunk_size == 0:
  break

  data_start = newline_idx + 1
  data_end = data_start + chunk_size

+ # Robustness check
+ if data_start > total_len:
+ break
+
  if data_end > total_len:
  chunks.append(file_data[data_start:])
  break
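
The loop above implements plain HTTP/1.1 chunked transfer decoding: a hex size line, the chunk data, a trailing CRLF, terminated by a zero-size chunk. A small illustrative call, reusing the sample body from the chunked-decoding test added further down in this diff:

from FlowAnalyzer.PacketParser import PacketParser

# One 5-byte chunk ("Hello") followed by the zero-size terminator.
assert PacketParser.dechunk_http_response(b"5\r\nHello\r\n0\r\n\r\n") == b"Hello"

# Non-chunked input still raises ValueError("Not chunked data"); the caller in
# extract_http_file_data wraps the call in contextlib.suppress(Exception) and
# keeps the raw body instead.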
@@ -88,51 +99,38 @@ class PacketParser:
  chunks.append(file_data[data_start:data_end])

  cursor = data_end
+ # Skip CRLF after chunk data
  while cursor < total_len and file_data[cursor] in (13, 10):
  cursor += 1

  return b"".join(chunks)

  @staticmethod
- def extract_http_file_data(full_request: bytes, http_file_data: bytes) -> Tuple[bytes, bytes]:
+ def extract_http_file_data(full_request: bytes) -> Tuple[bytes, bytes]:
  """
  Extract the file data from an HTTP request or response (hybrid mode - binary-optimized)
  """
  header = b""
  file_data = b""

+ if not full_request:
+ return b"", b""
  try:
- # --- 1. Extract header ---
- if full_request:
- raw_bytes = binascii.unhexlify(full_request)
- h_part, _ = PacketParser.split_http_headers(raw_bytes)
- header = h_part
-
- # --- 2. Extract body ---
- if http_file_data:
- try:
- file_data = binascii.unhexlify(http_file_data)
- return header, file_data
- except binascii.Error:
- logger.warning("Failed to parse http.file_data hex, falling back to the original method")
-
- # --- 3. Fallback mode ---
- if full_request and not file_data:
- raw_bytes = binascii.unhexlify(full_request)
- _, body_part = PacketParser.split_http_headers(raw_bytes)
-
- with contextlib.suppress(Exception):
- body_part = PacketParser.dechunk_http_response(body_part)
-
- with contextlib.suppress(Exception):
- if body_part.startswith(b"\x1f\x8b"):
- body_part = gzip.decompress(body_part)
-
- file_data = body_part
+ raw_bytes = binascii.unhexlify(full_request)
+ header, body_part = PacketParser.split_http_headers(raw_bytes)
+
+ with contextlib.suppress(Exception):
+ body_part = PacketParser.dechunk_http_response(body_part)
+
+ with contextlib.suppress(Exception):
+ if body_part.startswith(b"\x1f\x8b"):
+ body_part = gzip.decompress(body_part)
+
+ file_data = body_part
  return header, file_data

- except ValueError as e:
- logger.error(f"Hex conversion failed: {str(e)[:100]}...")
+ except binascii.Error:
+ logger.error("Hex conversion failed")
  return b"", b""
  except Exception as e:
  logger.error(f"Unknown error while parsing HTTP data: {e}")
@@ -149,12 +147,12 @@ class PacketParser:

  row = line.split(b"\t")
  try:
- frame_num, request_in, time_epoch, full_uri, full_request, http_file_data = PacketParser.parse_packet_data(row)
+ frame_num, request_in, time_epoch, full_uri, full_request = PacketParser.parse_packet_data(row)

- if not full_request and not http_file_data:
+ if not full_request:
  return None

- header, file_data = PacketParser.extract_http_file_data(full_request, http_file_data)
+ header, file_data = PacketParser.extract_http_file_data(full_request)

  # row[0] is http.response.code (bytes)
  is_response = bool(row[0])
@@ -0,0 +1,128 @@
+ import os
+ import time
+ from collections import defaultdict
+ from typing import List, Tuple
+
+ import dpkt
+
+
+ class PcapSplitter:
+ """
+ Encapsulates logic to split a PCAP/PCAPNG file into multiple smaller PCAP files
+ based on TCP flows, dynamically balanced for parallel processing.
+ """
+
+ def __init__(self, pcap_file: str, output_dir: str):
+ self.pcap_file = pcap_file
+ self.output_dir = output_dir
+
+ def get_stream_key(self, tcp, ip) -> Tuple:
+ """Generate a canonical flow key (4-tuple; the protocol is implicitly TCP)."""
+ src = ip.src
+ dst = ip.dst
+ sport = tcp.sport
+ dport = tcp.dport
+ # Canonicalize bidirectional flows to the same key
+ key1 = (src, dst, sport, dport)
+ key2 = (dst, src, dport, sport)
+ return key1 if key1 < key2 else key2
+
+ def split(self, threshold_mb: int = 10, default_chunks: int = 3) -> List[str]:
+ """
+ Split the pcap file into balanced chunks based on stream volume (bytes).
+ Uses a Greedy Partition Algorithm (Longest Processing Time first).
+
+ Args:
+ threshold_mb: File size threshold in MB. If smaller, do not split.
+ default_chunks: Number of chunks to split into if threshold is exceeded.
+
+ Returns:
+ List of generated file paths (or original file if not split).
+ """
+ if not os.path.exists(self.pcap_file):
+ raise FileNotFoundError(f"PCAP file not found: {self.pcap_file}")
+
+ file_size_mb = os.path.getsize(self.pcap_file) / (1024 * 1024)
+ if file_size_mb < threshold_mb:
+ print(f"File size {file_size_mb:.2f}MB < {threshold_mb}MB. Skipping split.")
+ return [self.pcap_file]
+
+ os.makedirs(self.output_dir, exist_ok=True)
+
+ start_time = time.time()
+ # Dictionary to store packets: stream_key -> list of (ts, buf)
+ streams = defaultdict(list)
+ # Dictionary to store total size: stream_key -> total_bytes
+ stream_sizes = defaultdict(int)
+
+ # 1. Read and Group Packets
+ print(f"Reading {self.pcap_file}...")
+ with open(self.pcap_file, "rb") as f:
+ if self.pcap_file.lower().endswith(".pcapng"):
+ reader = dpkt.pcapng.Reader(f)
+ else:
+ reader = dpkt.pcap.Reader(f)
+
+ for ts, buf in reader:
+ try:
+ eth = dpkt.ethernet.Ethernet(buf)
+ if not isinstance(eth.data, dpkt.ip.IP):
+ continue
+ ip = eth.data
+ if not isinstance(ip.data, dpkt.tcp.TCP):
+ continue
+ tcp = ip.data
+
+ key = self.get_stream_key(tcp, ip)
+ streams[key].append((ts, buf))
+ stream_sizes[key] += len(buf)
+ except Exception:
+ continue
+
+ total_streams = len(streams)
+ print(f"Found {total_streams} TCP streams.")
+
+ if total_streams == 0:
+ print("No TCP streams found to split.")
+ return []
+
+ # 2. Assign Streams to Buckets (Greedy LPT Algorithm)
+ num_chunks = min(default_chunks, total_streams)
+
+ # Sort streams by size (descending)
+ sorted_streams = sorted(stream_sizes.items(), key=lambda item: item[1], reverse=True)
+
+ # Buckets: list of (current_size, batch_index, list_of_keys)
+ # We perform standard list sort to find min bucket, sufficient for small N
+ buckets = [[0, i, []] for i in range(num_chunks)]
+
+ for key, size in sorted_streams:
+ # Find bucket with smallest current size
+ buckets.sort(key=lambda x: x[0])
+ smallest_bucket = buckets[0]
+
+ # Add stream to this bucket
+ smallest_bucket[0] += size
+ smallest_bucket[2].append(key)
+
+ print(f"Splitting into {num_chunks} files with volume balancing...")
+ generated_files = []
+
+ # 3. Write Batches
+ # Sorting buckets by index ensures file naming order 0, 1, 2...
+ buckets.sort(key=lambda x: x[1])
+
+ for size, i, batch_keys in buckets:
+ out_file_path = os.path.join(self.output_dir, f"batch_{i}.pcap")
+ generated_files.append(out_file_path)
+
+ with open(out_file_path, "wb") as f:
+ writer = dpkt.pcap.Writer(f)
+ for key in batch_keys:
+ for ts, buf in streams[key]:
+ writer.writepkt(buf, ts)
+
+ print(f" - Created {os.path.basename(out_file_path)}: {len(batch_keys)} streams ({size/1024/1024:.2f} MB)")
+
+ print(f"Split completed in {time.time() - start_time:.2f}s")
+ return generated_files
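
A short usage sketch for the new splitter; the file paths are placeholders, and the parameters shown are the ones defined in split() above (threshold_mb=10, default_chunks=3 by default):

from FlowAnalyzer.PcapSplitter import PcapSplitter

# Hypothetical paths. split() returns the original file untouched when it is
# smaller than threshold_mb; otherwise it writes up to default_chunks
# volume-balanced batch_<i>.pcap files into the output directory.
splitter = PcapSplitter("capture.pcapng", "output")
for path in splitter.split(threshold_mb=10, default_chunks=4):
    print(path)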
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: FlowAnalyzer
- Version: 0.4.4
+ Version: 0.4.5
  Summary: FlowAnalyzer is a traffic analyzer for parsing and processing the JSON data files exported by tshark
  Home-page: https://github.com/Byxs20/FlowAnalyzer
  Author: Byxs20
@@ -5,9 +5,13 @@ FlowAnalyzer/FlowAnalyzer.py
  FlowAnalyzer/Models.py
  FlowAnalyzer/PacketParser.py
  FlowAnalyzer/Path.py
+ FlowAnalyzer/PcapSplitter.py
  FlowAnalyzer/__init__.py
  FlowAnalyzer/logging_config.py
  FlowAnalyzer.egg-info/PKG-INFO
  FlowAnalyzer.egg-info/SOURCES.txt
  FlowAnalyzer.egg-info/dependency_links.txt
- FlowAnalyzer.egg-info/top_level.txt
+ FlowAnalyzer.egg-info/top_level.txt
+ tests/test.py
+ tests/test_parser.py
+ tests/test_split.py
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: FlowAnalyzer
- Version: 0.4.4
+ Version: 0.4.5
  Summary: FlowAnalyzer is a traffic analyzer for parsing and processing the JSON data files exported by tshark
  Home-page: https://github.com/Byxs20/FlowAnalyzer
  Author: Byxs20
@@ -7,7 +7,7 @@ with open(os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf-8"

  setup(
  name="FlowAnalyzer",
- version="0.4.4",
+ version="0.4.5",
  description="FlowAnalyzer is a traffic analyzer for parsing and processing the JSON data files exported by tshark",
  author="Byxs20",
  author_email="97766819@qq.com",
@@ -0,0 +1,48 @@
+ import os
+
+ from viztracer import VizTracer
+
+ from FlowAnalyzer.FlowAnalyzer import FlowAnalyzer
+
+ # ============================
+ # Configuration
+ # ============================
+ PCAP_FILE = "./tests/Beyond_Pro.pcapng" # Path to your test pcap file
+ DISPLAY_FILTER = "http" # tshark display filter, adjust as needed
+
+
+ # ============================
+ # Test logic
+ # ============================
+ def main():
+ if not os.path.exists(PCAP_FILE):
+ print(f"[ERROR] Capture file does not exist: {PCAP_FILE}")
+ return
+
+ print("[*] Parsing PCAP file...")
+ with VizTracer():
+ db_path = FlowAnalyzer.get_db_data(PCAP_FILE, DISPLAY_FILTER)
+ print(f"[*] Parsing finished, database created: {db_path}")
+
+ print("[*] Iterating over HTTP request-response pairs:")
+ analyzer = FlowAnalyzer(db_path)
+ total = 0
+ requests_count = 0
+ responses_count = 0
+
+ for pair in analyzer.generate_http_dict_pairs():
+ total += 1
+ if pair.request:
+ requests_count += 1
+ if pair.response:
+ responses_count += 1
+
+ print(f"[*] Total records: {total}")
+ print(f"[*] Requests: {requests_count}")
+ print(f"[*] Responses: {responses_count}")
+
+ print("[*] Test finished ✅")
+
+
+ if __name__ == "__main__":
+ main()
@@ -0,0 +1,47 @@
+ import binascii
+ import gzip
+ import unittest
+
+ from FlowAnalyzer.PacketParser import PacketParser
+
+
+ class TestPacketParserOptimization(unittest.TestCase):
+ def test_gzip_decompression(self):
+ # Construct a fake HTTP response with GZIP body
+ content = b"Hello, Gzip World!"
+ compressed = gzip.compress(content)
+ header = b"HTTP/1.1 200 OK\r\nContent-Encoding: gzip\r\n\r\n"
+ full_response = header + compressed
+
+ full_request_hex = binascii.hexlify(full_response)
+
+ # Test extract_http_file_data
+ extracted_header, extracted_body = PacketParser.extract_http_file_data(full_request_hex)
+
+ self.assertEqual(extracted_header, header)
+ self.assertEqual(extracted_body, content)
+
+ def test_basic_extraction(self):
+ # Case: Simple text body, no chunking
+ content = b"Simple Body"
+ header = b"HTTP/1.1 200 OK\r\n\r\n"
+ full_response = header + content
+ full_request_hex = binascii.hexlify(full_response)
+
+ extracted_header, extracted_body = PacketParser.extract_http_file_data(full_request_hex)
+ self.assertEqual(extracted_body, content)
+
+ def test_chunked_decoding(self):
+ # Case: Chunked body
+ # 5\r\nHello\r\n0\r\n\r\n
+ chunked_body = b"5\r\nHello\r\n0\r\n\r\n"
+ header = b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n"
+ full_response = header + chunked_body
+ full_request_hex = binascii.hexlify(full_response)
+
+ extracted_header, extracted_body = PacketParser.extract_http_file_data(full_request_hex)
+ self.assertEqual(extracted_body, b"Hello")
+
+
+ if __name__ == "__main__":
+ unittest.main()
@@ -0,0 +1,90 @@
+
+ import os
+ import shutil
+ import subprocess
+
+ from FlowAnalyzer.PcapSplitter import PcapSplitter
+
+ #############################
+ # Configuration
+ #############################
+ PCAP_FILE = r"./tests/Beyond_Pro.pcapng" # Change to your file
+ OUT_DIR = "output"
+ #############################
+
+ def clean_output_dir(directory: str):
+ if os.path.exists(directory):
+ print(f"Cleaning output directory: {directory}")
+ shutil.rmtree(directory)
+ os.makedirs(directory, exist_ok=True)
+
+ def count_packets(pcap_path: str, display_filter: str) -> int:
+ cmd = [
+ "tshark",
+ "-r", pcap_path,
+ "-Y", display_filter,
+ "-T", "fields",
+ "-e", "frame.number"
+ ]
+ try:
+ # Run tshark and capture output
+ result = subprocess.run(
+ cmd,
+ capture_output=True,
+ text=True,
+ check=True
+ )
+ # Count non-empty lines
+ count = sum(1 for line in result.stdout.splitlines() if line.strip())
+ return count
+ except subprocess.CalledProcessError as e:
+ print(f"Error running tshark on {pcap_path}: {e}")
+ return 0
+ except FileNotFoundError:
+ print("Error: tshark not found in PATH.")
+ return 0
+
+ def main():
+ print("Beginning split test...")
+
+ # 1. Clean output directory
+ clean_output_dir(OUT_DIR)
+
+ splitter = PcapSplitter(PCAP_FILE, OUT_DIR)
+
+ # Uses the default threshold and chunk count
+ result_files = splitter.split()
+
+ print(f"\nGenerated {len(result_files)} files:")
+ for f in result_files:
+ print(f)
+
+ # 2. Verify with Tshark
+ print("\nVerifying data integrity with Tshark...")
+ total_requests = 0
+ total_responses = 0
+
+ EXPECTED_REQUESTS = 12284
+ EXPECTED_RESPONSES = 12281
+
+ for pcap in result_files:
+ req_count = count_packets(pcap, "http.request")
+ resp_count = count_packets(pcap, "http.response")
+
+ print(f" {os.path.basename(pcap)}: Requests={req_count}, Responses={resp_count}")
+ total_requests += req_count
+ total_responses += resp_count
+
+ print("-" * 40)
+ print(f"Total Requests: {total_requests} (Expected: {EXPECTED_REQUESTS})")
+ print(f"Total Responses: {total_responses} (Expected: {EXPECTED_RESPONSES})")
+
+ if total_requests == EXPECTED_REQUESTS and total_responses == EXPECTED_RESPONSES:
+ print("\nSUCCESS: Data integrity verified.")
+ else:
+ print("\nFAILURE: Data integrity mismatch!")
+ exit(1)
+
+
+ if __name__ == "__main__":
+ main()
3 files without changes