FlowAnalyzer 0.4.3__tar.gz → 0.4.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowanalyzer-0.4.5/FlowAnalyzer/FlowAnalyzer.py +303 -0
- flowanalyzer-0.4.5/FlowAnalyzer/Models.py +27 -0
- flowanalyzer-0.4.5/FlowAnalyzer/PacketParser.py +183 -0
- flowanalyzer-0.4.5/FlowAnalyzer/PcapSplitter.py +128 -0
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/FlowAnalyzer/logging_config.py +2 -1
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/FlowAnalyzer.egg-info/PKG-INFO +19 -9
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/FlowAnalyzer.egg-info/SOURCES.txt +7 -1
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/PKG-INFO +19 -9
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/README.md +9 -7
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/setup.py +1 -1
- flowanalyzer-0.4.5/tests/test.py +48 -0
- flowanalyzer-0.4.5/tests/test_parser.py +47 -0
- flowanalyzer-0.4.5/tests/test_split.py +90 -0
- flowanalyzer-0.4.3/FlowAnalyzer/FlowAnalyzer.py +0 -481
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/FlowAnalyzer/Path.py +0 -0
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/FlowAnalyzer/__init__.py +0 -0
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/FlowAnalyzer.egg-info/dependency_links.txt +0 -0
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/FlowAnalyzer.egg-info/top_level.txt +0 -0
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/LICENSE +0 -0
- {flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/setup.cfg +0 -0
flowanalyzer-0.4.5/FlowAnalyzer/FlowAnalyzer.py
@@ -0,0 +1,303 @@
+import os
+import sqlite3
+import subprocess
+from concurrent.futures import ThreadPoolExecutor
+from typing import Iterable, Optional
+
+from .logging_config import logger
+from .Models import HttpPair, Request, Response
+from .PacketParser import PacketParser
+from .Path import get_default_tshark_path
+
+
+class FlowAnalyzer:
+    """
+    FlowAnalyzer traffic analyzer (smart-cache edition)
+    Features:
+    1. Tshark -> Pipe -> ThreadPool -> SQLite
+    2. Smart validation: compares the filter and the file's modification time to prevent stale caches
+    3. Storage optimization: the database file is created in the same directory as the capture file
+    """
+
+    def __init__(self, db_path: str):
+        """
+        Initialize FlowAnalyzer
+        :param db_path: path of the database file (returned by get_json_data)
+        """
+        # Path compatibility handling
+        if db_path.endswith(".json"):
+            possible_db = db_path + ".db"
+            if os.path.exists(possible_db):
+                self.db_path = possible_db
+            else:
+                self.db_path = db_path
+        else:
+            self.db_path = db_path
+
+        self.check_db_file()
+
+    def check_db_file(self):
+        """Check whether the database file exists"""
+        if not os.path.exists(self.db_path):
+            raise FileNotFoundError(f"未找到数据文件或缓存数据库: {self.db_path},请先调用 get_json_data 生成。")
+
+    def generate_http_dict_pairs(self) -> Iterable[HttpPair]:
+        """Yield HTTP request/response pairs (high-performance SQL JOIN version)"""
+        if not os.path.exists(self.db_path):
+            return
+
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            # Enable query optimization (read-only mode)
+            cursor.execute("PRAGMA query_only = 1;")
+
+            # === Step 1: pairing query ===
+            # Use SQLite's LEFT JOIN to match requests and responses directly
+            sql_pair = """
+                SELECT
+                    req.frame_num, req.header, req.file_data, req.full_uri, req.time_epoch,        -- 0-4 (Request)
+                    resp.frame_num, resp.header, resp.file_data, resp.time_epoch, resp.request_in  -- 5-9 (Response)
+                FROM requests req
+                LEFT JOIN responses resp ON req.frame_num = resp.request_in
+                ORDER BY req.frame_num ASC
+            """
+
+            cursor.execute(sql_pair)
+
+            # Stream over the results; memory usage stays very low
+            for row in cursor:
+                req = Request(frame_num=row[0], header=row[1] or b"", file_data=row[2] or b"", full_uri=row[3] or "", time_epoch=row[4])
+
+                resp = None
+                if row[5] is not None:
+                    resp = Response(frame_num=row[5], header=row[6] or b"", file_data=row[7] or b"", time_epoch=row[8], _request_in=row[9])
+
+                yield HttpPair(request=req, response=resp)
+
+            # === Step 2: orphan-response query ===
+            sql_orphan = """
+                SELECT frame_num, header, file_data, time_epoch, request_in
+                FROM responses
+                WHERE request_in NOT IN (SELECT frame_num FROM requests)
+            """
+            cursor.execute(sql_orphan)
+
+            for row in cursor:
+                resp = Response(frame_num=row[0], header=row[1] or b"", file_data=row[2] or b"", time_epoch=row[3], _request_in=row[4])
+                yield HttpPair(request=None, response=resp)
+
+    # =========================================================================
+    # Static methods: cache-validation logic and streaming processing
+    # =========================================================================
+
+    @staticmethod
+    def get_json_data(file_path: str, display_filter: str, tshark_path: Optional[str] = None) -> str:
+        """
+        Get the data path (smart-validation version).
+        """
+        if not os.path.exists(file_path):
+            raise FileNotFoundError("流量包路径不存在:%s" % file_path)
+
+        abs_file_path = os.path.abspath(file_path)
+        pcap_dir = os.path.dirname(abs_file_path)
+        base_name = os.path.splitext(os.path.basename(abs_file_path))[0]
+        db_path = os.path.join(pcap_dir, f"{base_name}.db")
+
+        if FlowAnalyzer._is_cache_valid(db_path, abs_file_path, display_filter):
+            logger.debug(f"缓存校验通过 (Filter匹配且文件未变),使用缓存: [{db_path}]")
+            return db_path
+        else:
+            logger.debug(f"缓存失效或不存在 (Filter变更或文件更新),开始重新解析...")
+
+        tshark_path = FlowAnalyzer.get_tshark_path(tshark_path)
+        FlowAnalyzer._stream_tshark_to_db(abs_file_path, display_filter, tshark_path, db_path)
+
+        return db_path
+
+    @staticmethod
+    def get_db_data(file_path: str, display_filter: str, tshark_path: Optional[str] = None) -> str:
+        return FlowAnalyzer.get_json_data(file_path, display_filter, tshark_path)
+
+    @staticmethod
+    def _is_cache_valid(db_path: str, pcap_path: str, current_filter: str) -> bool:
+        if not os.path.exists(db_path) or os.path.getsize(db_path) == 0:
+            return False
+
+        try:
+            current_mtime = os.path.getmtime(pcap_path)
+            current_size = os.path.getsize(pcap_path)
+
+            with sqlite3.connect(db_path) as conn:
+                cursor = conn.cursor()
+                cursor.execute("SELECT filter, pcap_mtime, pcap_size FROM meta_info LIMIT 1")
+                row = cursor.fetchone()
+
+                if not row:
+                    return False
+
+                cached_filter, cached_mtime, cached_size = row
+
+                if cached_filter == current_filter and cached_size == current_size and abs(cached_mtime - current_mtime) < 0.1:
+                    return True
+                else:
+                    logger.debug(f"校验失败: 缓存Filter={cached_filter} vs 当前={current_filter}")
+                    return False
+
+        except sqlite3.OperationalError:
+            return False
+        except Exception as e:
+            logger.warning(f"缓存校验出错: {e},将重新解析")
+            return False
+
+    @staticmethod
+    def _stream_tshark_to_db(pcap_path: str, display_filter: str, tshark_path: str, db_path: str):
+        """Stream-parse the capture and store it in the DB (multi-threaded version)"""
+        if os.path.exists(db_path):
+            os.remove(db_path)
+
+        with sqlite3.connect(db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute("PRAGMA synchronous = OFF")
+            cursor.execute("PRAGMA journal_mode = MEMORY")
+
+            cursor.execute("CREATE TABLE requests (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, full_uri TEXT, time_epoch REAL)")
+            cursor.execute("CREATE TABLE responses (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, time_epoch REAL, request_in INTEGER)")
+
+            cursor.execute("""
+                CREATE TABLE meta_info (
+                    id INTEGER PRIMARY KEY,
+                    filter TEXT,
+                    pcap_path TEXT,
+                    pcap_mtime REAL,
+                    pcap_size INTEGER
+                )
+            """)
+            conn.commit()
+
+        command = [
+            tshark_path,
+            "-r",
+            pcap_path,
+            "-Y",
+            f"({display_filter})",
+            "-T",
+            "fields",
+            "-e",
+            "http.response.code",  # 0
+            "-e",
+            "http.request_in",  # 1
+            "-e",
+            "tcp.reassembled.data",  # 2
+            "-e",
+            "frame.number",  # 3
+            "-e",
+            "tcp.payload",  # 4
+            "-e",
+            "frame.time_epoch",  # 5
+            "-e",
+            "exported_pdu.exported_pdu",  # 6
+            "-e",
+            "http.request.full_uri",  # 7
+            "-e",
+            "tcp.segment.count",  # 8
+            "-E",
+            "header=n",
+            "-E",
+            "separator=/t",
+            "-E",
+            "quote=n",
+            "-E",
+            "occurrence=f",
+        ]
+
+        logger.debug(f"执行 Tshark: {command}")
+        BATCH_SIZE = 2000
+        MAX_PENDING_BATCHES = 20  # Cap the number of pending batches held in memory (backpressure)
+
+        # Process the data in parallel with a ThreadPoolExecutor
+        max_workers = min(32, (os.cpu_count() or 1) + 4)
+
+        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(pcap_path)))
+        try:
+            with sqlite3.connect(db_path) as conn:
+                cursor = conn.cursor()
+
+                with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                    current_batch = []
+                    pending_futures = []  # List[Future]
+
+                    def write_results_to_db(results):
+                        """Write a batch of processed results to the database"""
+                        if not results:
+                            return
+
+                        db_req_rows = []
+                        db_resp_rows = []
+
+                        for item in results:
+                            if item["type"] == "response":
+                                db_resp_rows.append((item["frame_num"], item["header"], item["file_data"], item["time_epoch"], item["request_in"]))
+                            else:
+                                db_req_rows.append((item["frame_num"], item["header"], item["file_data"], item["full_uri"], item["time_epoch"]))
+
+                        if db_req_rows:
+                            cursor.executemany("INSERT OR REPLACE INTO requests VALUES (?,?,?,?,?)", db_req_rows)
+                        if db_resp_rows:
+                            cursor.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", db_resp_rows)
+
+                    def submit_batch():
+                        """Submit the current batch to the thread pool"""
+                        if not current_batch:
+                            return
+
+                        # Copy batch data for the thread (list slicing is fast)
+                        batch_data = current_batch[:]
+                        future = executor.submit(PacketParser.process_batch, batch_data)
+                        pending_futures.append(future)
+                        current_batch.clear()
+
+                    # --- Main Pipeline Loop ---
+                    if process.stdout:
+                        for line in process.stdout:
+                            current_batch.append(line)
+
+                            if len(current_batch) >= BATCH_SIZE:
+                                submit_batch()
+
+                            # Backpressure: if too many batches are pending, the main thread pauses reading and drains the oldest one first.
+                            # This keeps the pipeline flowing while preventing memory blow-up.
+                            if len(pending_futures) >= MAX_PENDING_BATCHES:
+                                oldest_future = pending_futures.pop(0)
+                                write_results_to_db(oldest_future.result())
+
+                    # --- Drain Pipeline ---
+                    # Submit any remaining data
+                    submit_batch()
+
+                    # Wait for all remaining tasks to finish
+                    for future in pending_futures:
+                        write_results_to_db(future.result())
+
+                # Create the index and the metadata
+                cursor.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
+                pcap_mtime = os.path.getmtime(pcap_path)
+                pcap_size = os.path.getsize(pcap_path)
+                cursor.execute("INSERT INTO meta_info (filter, pcap_path, pcap_mtime, pcap_size) VALUES (?, ?, ?, ?)", (display_filter, pcap_path, pcap_mtime, pcap_size))
+                conn.commit()
+
+        except Exception as e:
+            logger.error(f"解析错误: {e}")
+            if process.poll() is None:
+                process.terminate()
+        finally:
+            if process.poll() is None:
+                process.terminate()
+
+    @staticmethod
+    def get_tshark_path(tshark_path: Optional[str]) -> str:
+        default_tshark_path = get_default_tshark_path()
+        use_path = tshark_path if tshark_path and os.path.exists(tshark_path) else default_tshark_path
+        if not use_path or not os.path.exists(use_path):
+            logger.critical("未找到 Tshark,请检查路径配置")
+            exit(-1)
+        return use_path
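
For orientation, a minimal usage sketch of the pipeline added above (not part of the diff). It assumes the class is imported from the FlowAnalyzer.FlowAnalyzer module as laid out in this release; the capture path and display filter are placeholders, and a working tshark install is required.

# Usage sketch: build (or reuse) the SQLite cache, then stream the HTTP pairs.
from FlowAnalyzer.FlowAnalyzer import FlowAnalyzer

db_path = FlowAnalyzer.get_json_data("capture.pcapng", display_filter="http")
for request, response in FlowAnalyzer(db_path).generate_http_dict_pairs():
    if request is not None:
        print(request.frame_num, request.full_uri)
    if response is not None:
        print(response.frame_num, len(response.file_data))

Calling get_json_data a second time with the same filter and an unchanged capture should pass the _is_cache_valid check and skip the tshark run entirely.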
flowanalyzer-0.4.5/FlowAnalyzer/Models.py
@@ -0,0 +1,27 @@
+from dataclasses import dataclass
+from typing import NamedTuple, Optional
+
+
+@dataclass
+class Request:
+    __slots__ = ("frame_num", "header", "file_data", "full_uri", "time_epoch")
+    frame_num: int
+    header: bytes
+    file_data: bytes
+    full_uri: str
+    time_epoch: float
+
+
+@dataclass
+class Response:
+    __slots__ = ("frame_num", "header", "file_data", "time_epoch", "_request_in")
+    frame_num: int
+    header: bytes
+    file_data: bytes
+    time_epoch: float
+    _request_in: Optional[int]
+
+
+class HttpPair(NamedTuple):
+    request: Optional[Request]
+    response: Optional[Response]
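
These models are plain containers; a short illustrative sketch of how they behave (the values are made up):

# Illustrative only: constructing the records above with fabricated values.
from FlowAnalyzer.Models import HttpPair, Request

req = Request(frame_num=1, header=b"GET / HTTP/1.1\r\n\r\n", file_data=b"",
              full_uri="http://example.com/", time_epoch=0.0)
pair = HttpPair(request=req, response=None)  # a request with no matching response
request, response = pair                     # HttpPair unpacks like any NamedTuple
assert request is req and response is None

The __slots__ declarations keep per-record memory low, which matters when a capture yields many thousands of pairs.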
flowanalyzer-0.4.5/FlowAnalyzer/PacketParser.py
@@ -0,0 +1,183 @@
+import binascii
+import contextlib
+import gzip
+from typing import List, Optional, Tuple
+from urllib import parse
+
+from .logging_config import logger
+
+
+class PacketParser:
+    @staticmethod
+    def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes]:
+        """
+        Parse one line of tshark output
+        row definition (all bytes):
+            0: http.response.code
+            1: http.request_in
+            2: tcp.reassembled.data
+            3: frame.number
+            4: tcp.payload
+            5: frame.time_epoch
+            6: exported_pdu.exported_pdu
+            7: http.request.full_uri
+            8: tcp.segment.count
+        """
+        frame_num = int(row[3])
+        request_in = int(row[1]) if row[1] else frame_num
+        # Decode only URI to string
+        full_uri = parse.unquote(row[7].decode("utf-8", errors="replace")) if row[7] else ""
+        time_epoch = float(row[5])
+
+        # Logic for Raw Packet (Header Source)
+        # Previous index 9 is now 8 since we removed http.file_data
+        is_reassembled = len(row) > 8 and row[8]
+
+        if is_reassembled and row[2]:
+            full_request = row[2]
+        elif row[4]:
+            full_request = row[4]
+        else:
+            # Fallback (e.g. Exported PDU)
+            full_request = row[2] if row[2] else (row[6] if row[6] else b"")
+
+        return frame_num, request_in, time_epoch, full_uri, full_request
+
+    @staticmethod
+    def split_http_headers(file_data: bytes) -> Tuple[bytes, bytes]:
+        headerEnd = file_data.find(b"\r\n\r\n")
+        if headerEnd != -1:
+            return file_data[: headerEnd + 4], file_data[headerEnd + 4 :]
+        elif file_data.find(b"\n\n") != -1:
+            headerEnd = file_data.index(b"\n\n") + 2
+            return file_data[:headerEnd], file_data[headerEnd:]
+        return b"", file_data
+
+    @staticmethod
+    def dechunk_http_response(file_data: bytes) -> bytes:
+        """Decode chunked TCP data"""
+        if not file_data:
+            return b""
+
+        chunks = []
+        cursor = 0
+        total_len = len(file_data)
+
+        while cursor < total_len:
+            newline_idx = file_data.find(b"\n", cursor)
+            if newline_idx == -1:
+                # If no newline found, maybe it's just remaining data (though strictly should end with 0 chunk)
+                # But for robustness we might perform a "best effort" or just stop.
+                # raising ValueError("Not chunked data") might be too aggressive if we are just "trying" to dechunk
+                # Let's assume non-chunked if strict format not found
+                raise ValueError("Not chunked data")
+
+            size_line = file_data[cursor:newline_idx].strip()
+            if not size_line:
+                cursor = newline_idx + 1
+                continue
+
+            try:
+                chunk_size = int(size_line, 16)
+            except ValueError:
+                raise ValueError("Invalid chunk size")
+
+            if chunk_size == 0:
+                break
+
+            data_start = newline_idx + 1
+            data_end = data_start + chunk_size
+
+            # Robustness check
+            if data_start > total_len:
+                break
+
+            if data_end > total_len:
+                chunks.append(file_data[data_start:])
+                break
+
+            chunks.append(file_data[data_start:data_end])
+
+            cursor = data_end
+            # Skip CRLF after chunk data
+            while cursor < total_len and file_data[cursor] in (13, 10):
+                cursor += 1
+
+        return b"".join(chunks)
+
+    @staticmethod
+    def extract_http_file_data(full_request: bytes) -> Tuple[bytes, bytes]:
+        """
+        Extract the file data from an HTTP request or response (hybrid mode, binary-optimized version)
+        """
+        header = b""
+        file_data = b""
+
+        if not full_request:
+            return b"", b""
+        try:
+            raw_bytes = binascii.unhexlify(full_request)
+            header, body_part = PacketParser.split_http_headers(raw_bytes)
+
+            with contextlib.suppress(Exception):
+                body_part = PacketParser.dechunk_http_response(body_part)
+
+            with contextlib.suppress(Exception):
+                if body_part.startswith(b"\x1f\x8b"):
+                    body_part = gzip.decompress(body_part)
+
+            file_data = body_part
+            return header, file_data
+
+        except binascii.Error:
+            logger.error("Hex转换失败")
+            return b"", b""
+        except Exception as e:
+            logger.error(f"解析HTTP数据未知错误: {e}")
+            return b"", b""
+
+    @staticmethod
+    def process_row(line: bytes) -> Optional[dict]:
+        """
+        Process a single line and return a structured result for the main thread to write
+        """
+        line = line.rstrip(b"\r\n")
+        if not line:
+            return None
+
+        row = line.split(b"\t")
+        try:
+            frame_num, request_in, time_epoch, full_uri, full_request = PacketParser.parse_packet_data(row)
+
+            if not full_request:
+                return None
+
+            header, file_data = PacketParser.extract_http_file_data(full_request)
+
+            # row[0] is http.response.code (bytes)
+            is_response = bool(row[0])
+
+            return {
+                "type": "response" if is_response else "request",
+                "frame_num": frame_num,
+                "header": header,
+                "file_data": file_data,
+                "time_epoch": time_epoch,
+                "request_in": request_in,  # Only useful for Response
+                "full_uri": full_uri,  # Only useful for Request
+            }
+
+        except Exception:
+            return None
+
+    @staticmethod
+    def process_batch(lines: List[bytes]) -> List[dict]:
+        """
+        Process lines in batches to reduce function-call overhead
+        """
+        results = []
+        for line in lines:
+            res = PacketParser.process_row(line)
+            if res:
+                results.append(res)
+        return results
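
A rough sketch of what extract_http_file_data does to a single hex-encoded payload of the kind tshark emits for tcp.payload; the sample bytes here are fabricated:

# Sketch with fabricated data: header/body split plus chunked-body decoding.
import binascii

from FlowAnalyzer.PacketParser import PacketParser

raw = (b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n"
       b"5\r\nhello\r\n0\r\n\r\n")
hex_payload = binascii.hexlify(raw)  # stand-in for the hex string read from tshark
header, body = PacketParser.extract_http_file_data(hex_payload)
assert body == b"hello"              # chunk framing removed
print(header.decode("ascii", errors="replace"))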
flowanalyzer-0.4.5/FlowAnalyzer/PcapSplitter.py
@@ -0,0 +1,128 @@
+import os
+import time
+from collections import defaultdict
+from typing import List, Tuple
+
+import dpkt
+
+
+class PcapSplitter:
+    """
+    Encapsulates logic to split a PCAP/PCAPNG file into multiple smaller PCAP files
+    based on TCP flows, dynamically balanced for parallel processing.
+    """
+
+    def __init__(self, pcap_file: str, output_dir: str):
+        self.pcap_file = pcap_file
+        self.output_dir = output_dir
+
+    def get_stream_key(self, tcp, ip) -> Tuple:
+        """Generate a 5-tuple key for the flow."""
+        src = ip.src
+        dst = ip.dst
+        sport = tcp.sport
+        dport = tcp.dport
+        # Canonicalize bidirectional flows to the same key
+        key1 = (src, dst, sport, dport)
+        key2 = (dst, src, dport, sport)
+        return key1 if key1 < key2 else key2
+
+    def split(self, threshold_mb: int = 10, default_chunks: int = 3) -> List[str]:
+        """
+        Split the pcap file into balanced chunks based on stream volume (bytes).
+        Uses a Greedy Partition Algorithm (Longest Processing Time first).
+
+        Args:
+            threshold_mb: File size threshold in MB. If smaller, do not split.
+            default_chunks: Number of chunks to split into if threshold is exceeded.
+
+        Returns:
+            List of generated file paths (or original file if not split).
+        """
+        if not os.path.exists(self.pcap_file):
+            raise FileNotFoundError(f"PCAP file not found: {self.pcap_file}")
+
+        file_size_mb = os.path.getsize(self.pcap_file) / (1024 * 1024)
+        if file_size_mb < threshold_mb:
+            print(f"File size {file_size_mb:.2f}MB < {threshold_mb}MB. Skipping split.")
+            return [self.pcap_file]
+
+        os.makedirs(self.output_dir, exist_ok=True)
+
+        start_time = time.time()
+        # Dictionary to store packets: stream_key -> list of (ts, buf)
+        streams = defaultdict(list)
+        # Dictionary to store total size: stream_key -> total_bytes
+        stream_sizes = defaultdict(int)
+
+        # 1. Read and Group Packets
+        print(f"Reading {self.pcap_file}...")
+        with open(self.pcap_file, "rb") as f:
+            if self.pcap_file.lower().endswith(".pcapng"):
+                reader = dpkt.pcapng.Reader(f)
+            else:
+                reader = dpkt.pcap.Reader(f)
+
+            for ts, buf in reader:
+                try:
+                    eth = dpkt.ethernet.Ethernet(buf)
+                    if not isinstance(eth.data, dpkt.ip.IP):
+                        continue
+                    ip = eth.data
+                    if not isinstance(ip.data, dpkt.tcp.TCP):
+                        continue
+                    tcp = ip.data
+
+                    key = self.get_stream_key(tcp, ip)
+                    streams[key].append((ts, buf))
+                    stream_sizes[key] += len(buf)
+                except Exception:
+                    continue
+
+        total_streams = len(streams)
+        print(f"Found {total_streams} TCP streams.")
+
+        if total_streams == 0:
+            print("No TCP streams found to split.")
+            return []
+
+        # 2. Assign Streams to Buckets (Greedy LPT Algorithm)
+        num_chunks = min(default_chunks, total_streams)
+
+        # Sort streams by size (descending)
+        sorted_streams = sorted(stream_sizes.items(), key=lambda item: item[1], reverse=True)
+
+        # Buckets: list of (current_size, batch_index, list_of_keys)
+        # We perform standard list sort to find min bucket, sufficient for small N
+        buckets = [[0, i, []] for i in range(num_chunks)]
+
+        for key, size in sorted_streams:
+            # Find bucket with smallest current size
+            buckets.sort(key=lambda x: x[0])
+            smallest_bucket = buckets[0]
+
+            # Add stream to this bucket
+            smallest_bucket[0] += size
+            smallest_bucket[2].append(key)
+
+        print(f"Splitting into {num_chunks} files with volume balancing...")
+        generated_files = []
+
+        # 3. Write Batches
+        # Sort buckets by index ensures file naming order 0, 1, 2...
+        buckets.sort(key=lambda x: x[1])
+
+        for size, i, batch_keys in buckets:
+            out_file_path = os.path.join(self.output_dir, f"batch_{i}.pcap")
+            generated_files.append(out_file_path)
+
+            with open(out_file_path, "wb") as f:
+                writer = dpkt.pcap.Writer(f)
+                for key in batch_keys:
+                    for ts, buf in streams[key]:
+                        writer.writepkt(buf, ts)
+
+            print(f" - Created {os.path.basename(out_file_path)}: {len(batch_keys)} streams ({size/1024/1024:.2f} MB)")
+
+        print(f"Split completed in {time.time() - start_time:.2f}s")
+        return generated_files
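
One possible way to combine the splitter with the analyzer: split a large capture into flow-balanced batches, then run the analysis per batch. The file names below are placeholders and the pattern is only a sketch, not an API documented by the package.

# Sketch: split by TCP flow, then analyze each batch independently.
from FlowAnalyzer.FlowAnalyzer import FlowAnalyzer
from FlowAnalyzer.PcapSplitter import PcapSplitter

parts = PcapSplitter("big_capture.pcap", output_dir="batches").split(threshold_mb=10, default_chunks=3)
for part in parts:
    db_path = FlowAnalyzer.get_db_data(part, display_filter="http")
    for request, response in FlowAnalyzer(db_path).generate_http_dict_pairs():
        if response is not None and response.file_data:
            print(part, response.frame_num, len(response.file_data))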
{flowanalyzer-0.4.3 → flowanalyzer-0.4.5}/FlowAnalyzer/logging_config.py
@@ -15,8 +15,9 @@ def configure_logger(logger_name, level=logging.DEBUG) -> logging.Logger:
     console_handler.setFormatter(formatter)
     return logger
 
+
 logger = configure_logger("FlowAnalyzer", logging.INFO)
 
-if __name__ ==
+if __name__ == "__main__":
     logger = configure_logger("FlowAnalyzer")
     logger.info("This is a test!")
|