FlowAnalyzer 0.4.3__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- FlowAnalyzer/FlowAnalyzer.py +68 -244
- FlowAnalyzer/Models.py +27 -0
- FlowAnalyzer/PacketParser.py +185 -0
- FlowAnalyzer/logging_config.py +2 -1
- {flowanalyzer-0.4.3.dist-info → flowanalyzer-0.4.4.dist-info}/METADATA +19 -9
- flowanalyzer-0.4.4.dist-info/RECORD +11 -0
- {flowanalyzer-0.4.3.dist-info → flowanalyzer-0.4.4.dist-info}/WHEEL +1 -1
- flowanalyzer-0.4.3.dist-info/RECORD +0 -9
- {flowanalyzer-0.4.3.dist-info → flowanalyzer-0.4.4.dist-info/licenses}/LICENSE +0 -0
- {flowanalyzer-0.4.3.dist-info → flowanalyzer-0.4.4.dist-info}/top_level.txt +0 -0
FlowAnalyzer/FlowAnalyzer.py
CHANGED
|
@@ -1,47 +1,20 @@
|
|
|
1
|
-
import contextlib
|
|
2
|
-
import csv
|
|
3
|
-
import gzip
|
|
4
1
|
import os
|
|
5
2
|
import sqlite3
|
|
6
3
|
import subprocess
|
|
7
|
-
from dataclasses import dataclass
|
|
8
|
-
from typing import Iterable, NamedTuple, Optional, Tuple
|
|
9
|
-
from urllib import parse
|
|
4
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
5
|
+
from typing import Iterable, Optional
|
|
10
6
|
|
|
11
7
|
from .logging_config import logger
|
|
8
|
+
from .Models import HttpPair, Request, Response
|
|
9
|
+
from .PacketParser import PacketParser
|
|
12
10
|
from .Path import get_default_tshark_path
|
|
13
11
|
|
|
14
12
|
|
|
15
|
-
@dataclass
|
|
16
|
-
class Request:
|
|
17
|
-
__slots__ = ("frame_num", "header", "file_data", "full_uri", "time_epoch")
|
|
18
|
-
frame_num: int
|
|
19
|
-
header: bytes
|
|
20
|
-
file_data: bytes
|
|
21
|
-
full_uri: str
|
|
22
|
-
time_epoch: float
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
@dataclass
|
|
26
|
-
class Response:
|
|
27
|
-
__slots__ = ("frame_num", "header", "file_data", "time_epoch", "_request_in")
|
|
28
|
-
frame_num: int
|
|
29
|
-
header: bytes
|
|
30
|
-
file_data: bytes
|
|
31
|
-
time_epoch: float
|
|
32
|
-
_request_in: Optional[int]
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class HttpPair(NamedTuple):
|
|
36
|
-
request: Optional[Request]
|
|
37
|
-
response: Optional[Response]
|
|
38
|
-
|
|
39
|
-
|
|
40
13
|
class FlowAnalyzer:
|
|
41
14
|
"""
|
|
42
15
|
FlowAnalyzer 流量分析器 (智能缓存版)
|
|
43
16
|
特点:
|
|
44
|
-
1. Tshark -> Pipe ->
|
|
17
|
+
1. Tshark -> Pipe -> ThreadPool -> SQLite
|
|
45
18
|
2. 智能校验:自动比对 Filter 和文件修改时间,防止缓存错乱
|
|
46
19
|
3. 存储优化:数据库文件生成在流量包同级目录下
|
|
47
20
|
"""
|
|
@@ -80,7 +53,6 @@ class FlowAnalyzer:
|
|
|
80
53
|
|
|
81
54
|
# === 第一步:配对查询 ===
|
|
82
55
|
# 利用 SQLite 的 LEFT JOIN 直接匹配请求和响应
|
|
83
|
-
# 避免将所有数据加载到 Python 内存中
|
|
84
56
|
sql_pair = """
|
|
85
57
|
SELECT
|
|
86
58
|
req.frame_num, req.header, req.file_data, req.full_uri, req.time_epoch, -- 0-4 (Request)
|
|
@@ -94,19 +66,15 @@ class FlowAnalyzer:
|
|
|
94
66
|
|
|
95
67
|
# 流式遍历结果,内存占用极低
|
|
96
68
|
for row in cursor:
|
|
97
|
-
# 构建 Request 对象
|
|
98
|
-
# 注意处理 NULL 情况,虽然 requests 表理论上不为空,但防万一用 or b''
|
|
99
69
|
req = Request(frame_num=row[0], header=row[1] or b"", file_data=row[2] or b"", full_uri=row[3] or "", time_epoch=row[4])
|
|
100
70
|
|
|
101
71
|
resp = None
|
|
102
|
-
# 如果 row[5] (Response frame_num) 不为空,说明匹配到了响应
|
|
103
72
|
if row[5] is not None:
|
|
104
73
|
resp = Response(frame_num=row[5], header=row[6] or b"", file_data=row[7] or b"", time_epoch=row[8], _request_in=row[9])
|
|
105
74
|
|
|
106
75
|
yield HttpPair(request=req, response=resp)
|
|
107
76
|
|
|
108
77
|
# === 第二步:孤儿响应查询 ===
|
|
109
|
-
# 找出那些有 request_in 但找不到对应 Request 的响应包
|
|
110
78
|
sql_orphan = """
|
|
111
79
|
SELECT frame_num, header, file_data, time_epoch, request_in
|
|
112
80
|
FROM responses
|
|
@@ -126,33 +94,21 @@ class FlowAnalyzer:
|
|
|
126
94
|
def get_json_data(file_path: str, display_filter: str, tshark_path: Optional[str] = None) -> str:
|
|
127
95
|
"""
|
|
128
96
|
获取数据路径 (智能校验版)。
|
|
129
|
-
|
|
130
|
-
逻辑:
|
|
131
|
-
1. 根据 PCAP 路径推算 DB 路径 (位于 PCAP 同级目录)。
|
|
132
|
-
2. 检查 DB 是否存在。
|
|
133
|
-
3. 检查 Filter 和文件元数据是否一致。
|
|
134
|
-
4. 若一致返回路径,不一致则重新解析。
|
|
135
97
|
"""
|
|
136
98
|
if not os.path.exists(file_path):
|
|
137
99
|
raise FileNotFoundError("流量包路径不存在:%s" % file_path)
|
|
138
100
|
|
|
139
|
-
# --- 修改处:获取流量包的绝对路径和所在目录 ---
|
|
140
101
|
abs_file_path = os.path.abspath(file_path)
|
|
141
|
-
pcap_dir = os.path.dirname(abs_file_path)
|
|
102
|
+
pcap_dir = os.path.dirname(abs_file_path)
|
|
142
103
|
base_name = os.path.splitext(os.path.basename(abs_file_path))[0]
|
|
143
|
-
|
|
144
|
-
# 将 db_path 拼接在流量包所在的目录下
|
|
145
104
|
db_path = os.path.join(pcap_dir, f"{base_name}.db")
|
|
146
|
-
# ----------------------------------------
|
|
147
105
|
|
|
148
|
-
# --- 校验环节 ---
|
|
149
106
|
if FlowAnalyzer._is_cache_valid(db_path, abs_file_path, display_filter):
|
|
150
107
|
logger.debug(f"缓存校验通过 (Filter匹配且文件未变),使用缓存: [{db_path}]")
|
|
151
108
|
return db_path
|
|
152
109
|
else:
|
|
153
110
|
logger.debug(f"缓存失效或不存在 (Filter变更或文件更新),开始重新解析...")
|
|
154
111
|
|
|
155
|
-
# --- 解析环节 ---
|
|
156
112
|
tshark_path = FlowAnalyzer.get_tshark_path(tshark_path)
|
|
157
113
|
FlowAnalyzer._stream_tshark_to_db(abs_file_path, display_filter, tshark_path, db_path)
|
|
158
114
|
|
|
@@ -160,17 +116,10 @@ class FlowAnalyzer:
|
|
|
160
116
|
|
|
161
117
|
@staticmethod
|
|
162
118
|
def get_db_data(file_path: str, display_filter: str, tshark_path: Optional[str] = None) -> str:
|
|
163
|
-
"""
|
|
164
|
-
获取数据库路径 (get_json_data 的语义化别名)。
|
|
165
|
-
新项目建议使用此方法名,get_json_data 保留用于兼容旧习惯。
|
|
166
|
-
"""
|
|
167
119
|
return FlowAnalyzer.get_json_data(file_path, display_filter, tshark_path)
|
|
168
120
|
|
|
169
121
|
@staticmethod
|
|
170
122
|
def _is_cache_valid(db_path: str, pcap_path: str, current_filter: str) -> bool:
|
|
171
|
-
"""
|
|
172
|
-
检查缓存有效性:对比 Filter 字符串和文件元数据
|
|
173
|
-
"""
|
|
174
123
|
if not os.path.exists(db_path) or os.path.getsize(db_path) == 0:
|
|
175
124
|
return False
|
|
176
125
|
|
|
@@ -188,7 +137,6 @@ class FlowAnalyzer:
|
|
|
188
137
|
|
|
189
138
|
cached_filter, cached_mtime, cached_size = row
|
|
190
139
|
|
|
191
|
-
# 容差 0.1秒
|
|
192
140
|
if cached_filter == current_filter and cached_size == current_size and abs(cached_mtime - current_mtime) < 0.1:
|
|
193
141
|
return True
|
|
194
142
|
else:
|
|
@@ -203,16 +151,7 @@ class FlowAnalyzer:
|
|
|
203
151
|
|
|
204
152
|
@staticmethod
|
|
205
153
|
def _stream_tshark_to_db(pcap_path: str, display_filter: str, tshark_path: str, db_path: str):
|
|
206
|
-
"""流式解析并存入DB
|
|
207
|
-
# 增加 CSV 字段大小限制,防止超大包报错
|
|
208
|
-
# 将限制设置为系统最大值,注意 32位系统不要超过 2GB (但 Python int通常是动态的,保险起见设大一点)
|
|
209
|
-
# Windows下 sys.maxsize 通常足够大
|
|
210
|
-
try:
|
|
211
|
-
csv.field_size_limit(500 * 1024 * 1024) # 500 MB
|
|
212
|
-
except Exception:
|
|
213
|
-
# 如果失败,尝试取最大值
|
|
214
|
-
csv.field_size_limit(int(2**31 - 1))
|
|
215
|
-
|
|
154
|
+
"""流式解析并存入DB (多线程版)"""
|
|
216
155
|
if os.path.exists(db_path):
|
|
217
156
|
os.remove(db_path)
|
|
218
157
|
|
|
@@ -235,7 +174,6 @@ class FlowAnalyzer:
|
|
|
235
174
|
""")
|
|
236
175
|
conn.commit()
|
|
237
176
|
|
|
238
|
-
# 修改命令为 -T fields 模式
|
|
239
177
|
command = [
|
|
240
178
|
tshark_path,
|
|
241
179
|
"-r",
|
|
@@ -244,7 +182,6 @@ class FlowAnalyzer:
|
|
|
244
182
|
f"({display_filter})",
|
|
245
183
|
"-T",
|
|
246
184
|
"fields",
|
|
247
|
-
# 指定输出字段
|
|
248
185
|
"-e",
|
|
249
186
|
"http.response.code", # 0
|
|
250
187
|
"-e",
|
|
@@ -261,81 +198,93 @@ class FlowAnalyzer:
|
|
|
261
198
|
"exported_pdu.exported_pdu", # 6
|
|
262
199
|
"-e",
|
|
263
200
|
"http.request.full_uri", # 7
|
|
264
|
-
|
|
201
|
+
"-e",
|
|
202
|
+
"http.file_data", # 8
|
|
203
|
+
"-e",
|
|
204
|
+
"tcp.segment.count", # 9
|
|
265
205
|
"-E",
|
|
266
|
-
"header=n",
|
|
206
|
+
"header=n",
|
|
267
207
|
"-E",
|
|
268
|
-
"separator=|",
|
|
208
|
+
"separator=/t",
|
|
269
209
|
"-E",
|
|
270
|
-
"quote=d",
|
|
210
|
+
"quote=n",
|
|
271
211
|
"-E",
|
|
272
|
-
"occurrence=f",
|
|
212
|
+
"occurrence=f",
|
|
273
213
|
]
|
|
274
214
|
|
|
275
215
|
logger.debug(f"执行 Tshark: {command}")
|
|
216
|
+
BATCH_SIZE = 2000
|
|
217
|
+
MAX_PENDING_BATCHES = 20 # 控制内存中待处理的批次数量 (Backpressure)
|
|
276
218
|
|
|
277
|
-
# 使用
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
db_req_rows = []
|
|
281
|
-
db_resp_rows = []
|
|
282
|
-
BATCH_SIZE = 5000
|
|
219
|
+
# 使用 ThreadPoolExecutor 并行处理数据
|
|
220
|
+
max_workers = min(32, (os.cpu_count() or 1) + 4)
|
|
283
221
|
|
|
222
|
+
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(pcap_path)))
|
|
284
223
|
try:
|
|
285
|
-
# 使用 csv.reader 解析 stdout 流
|
|
286
|
-
reader = csv.reader(process.stdout, delimiter="|", quotechar='"') # type: ignore
|
|
287
224
|
with sqlite3.connect(db_path) as conn:
|
|
288
225
|
cursor = conn.cursor()
|
|
289
226
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
# [
|
|
293
|
-
if not row:
|
|
294
|
-
continue
|
|
227
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
228
|
+
current_batch = []
|
|
229
|
+
pending_futures = [] # List[Future]
|
|
295
230
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
231
|
+
def write_results_to_db(results):
|
|
232
|
+
"""将一批处理好的结果写入数据库"""
|
|
233
|
+
if not results:
|
|
234
|
+
return
|
|
299
235
|
|
|
300
|
-
|
|
301
|
-
|
|
236
|
+
db_req_rows = []
|
|
237
|
+
db_resp_rows = []
|
|
302
238
|
|
|
303
|
-
|
|
239
|
+
for item in results:
|
|
240
|
+
if item["type"] == "response":
|
|
241
|
+
db_resp_rows.append((item["frame_num"], item["header"], item["file_data"], item["time_epoch"], item["request_in"]))
|
|
242
|
+
else:
|
|
243
|
+
db_req_rows.append((item["frame_num"], item["header"], item["file_data"], item["full_uri"], item["time_epoch"]))
|
|
304
244
|
|
|
305
|
-
|
|
306
|
-
# http.response.code (index 0) 是否为空
|
|
307
|
-
if row[0]:
|
|
308
|
-
# Response
|
|
309
|
-
db_resp_rows.append((frame_num, header, file_data, time_epoch, request_in))
|
|
310
|
-
else:
|
|
311
|
-
# Request
|
|
312
|
-
db_req_rows.append((frame_num, header, file_data, full_uri, time_epoch))
|
|
313
|
-
|
|
314
|
-
# 批量插入
|
|
315
|
-
if len(db_req_rows) >= BATCH_SIZE:
|
|
245
|
+
if db_req_rows:
|
|
316
246
|
cursor.executemany("INSERT OR REPLACE INTO requests VALUES (?,?,?,?,?)", db_req_rows)
|
|
317
|
-
|
|
318
|
-
if len(db_resp_rows) >= BATCH_SIZE:
|
|
247
|
+
if db_resp_rows:
|
|
319
248
|
cursor.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", db_resp_rows)
|
|
320
|
-
db_resp_rows.clear()
|
|
321
249
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
250
|
+
def submit_batch():
|
|
251
|
+
"""提交当前批次到线程池"""
|
|
252
|
+
if not current_batch:
|
|
253
|
+
return
|
|
325
254
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
255
|
+
# Copy batch data for the thread (list slicing is fast)
|
|
256
|
+
batch_data = current_batch[:]
|
|
257
|
+
future = executor.submit(PacketParser.process_batch, batch_data)
|
|
258
|
+
pending_futures.append(future)
|
|
259
|
+
current_batch.clear()
|
|
331
260
|
|
|
332
|
-
|
|
333
|
-
|
|
261
|
+
# --- Main Pipeline Loop ---
|
|
262
|
+
if process.stdout:
|
|
263
|
+
for line in process.stdout:
|
|
264
|
+
current_batch.append(line)
|
|
265
|
+
|
|
266
|
+
if len(current_batch) >= BATCH_SIZE:
|
|
267
|
+
submit_batch()
|
|
268
|
+
|
|
269
|
+
# Backpressure: 如果积压的任务太多,主线程暂停读取,先处理掉最早的一个
|
|
270
|
+
# 这样既保证了 Pipeline 流动,又防止内存爆掉
|
|
271
|
+
if len(pending_futures) >= MAX_PENDING_BATCHES:
|
|
272
|
+
oldest_future = pending_futures.pop(0)
|
|
273
|
+
write_results_to_db(oldest_future.result())
|
|
274
|
+
|
|
275
|
+
# --- Drain Pipeline ---
|
|
276
|
+
# 提交剩余数据
|
|
277
|
+
submit_batch()
|
|
334
278
|
|
|
279
|
+
# 等待所有剩余任务完成
|
|
280
|
+
for future in pending_futures:
|
|
281
|
+
write_results_to_db(future.result())
|
|
282
|
+
|
|
283
|
+
# 创建索引和元数据
|
|
284
|
+
cursor.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
|
|
335
285
|
pcap_mtime = os.path.getmtime(pcap_path)
|
|
336
286
|
pcap_size = os.path.getsize(pcap_path)
|
|
337
287
|
cursor.execute("INSERT INTO meta_info (filter, pcap_path, pcap_mtime, pcap_size) VALUES (?, ?, ?, ?)", (display_filter, pcap_path, pcap_mtime, pcap_size))
|
|
338
|
-
|
|
339
288
|
conn.commit()
|
|
340
289
|
|
|
341
290
|
except Exception as e:
|
|
@@ -346,131 +295,6 @@ class FlowAnalyzer:
|
|
|
346
295
|
if process.poll() is None:
|
|
347
296
|
process.terminate()
|
|
348
297
|
|
|
349
|
-
# --- 辅助静态方法 ---
|
|
350
|
-
|
|
351
|
-
@staticmethod
|
|
352
|
-
def parse_packet_data(row: list) -> Tuple[int, int, float, str, str]:
|
|
353
|
-
# row definition:
|
|
354
|
-
# 0: http.response.code
|
|
355
|
-
# 1: http.request_in
|
|
356
|
-
# 2: tcp.reassembled.data
|
|
357
|
-
# 3: frame.number
|
|
358
|
-
# 4: tcp.payload
|
|
359
|
-
# 5: frame.time_epoch
|
|
360
|
-
# 6: exported_pdu.exported_pdu
|
|
361
|
-
# 7: http.request.full_uri
|
|
362
|
-
|
|
363
|
-
frame_num = int(row[3])
|
|
364
|
-
request_in = int(row[1]) if row[1] else frame_num
|
|
365
|
-
full_uri = parse.unquote(row[7]) if row[7] else ""
|
|
366
|
-
time_epoch = float(row[5])
|
|
367
|
-
|
|
368
|
-
if row[2]:
|
|
369
|
-
full_request = row[2]
|
|
370
|
-
elif row[4]:
|
|
371
|
-
full_request = row[4]
|
|
372
|
-
else:
|
|
373
|
-
full_request = row[6] if row[6] else ""
|
|
374
|
-
|
|
375
|
-
return frame_num, request_in, time_epoch, full_uri, full_request
|
|
376
|
-
|
|
377
|
-
@staticmethod
|
|
378
|
-
def split_http_headers(file_data: bytes) -> Tuple[bytes, bytes]:
|
|
379
|
-
headerEnd = file_data.find(b"\r\n\r\n")
|
|
380
|
-
if headerEnd != -1:
|
|
381
|
-
return file_data[: headerEnd + 4], file_data[headerEnd + 4 :]
|
|
382
|
-
elif file_data.find(b"\n\n") != -1:
|
|
383
|
-
headerEnd = file_data.index(b"\n\n") + 2
|
|
384
|
-
return file_data[:headerEnd], file_data[headerEnd:]
|
|
385
|
-
return b"", file_data
|
|
386
|
-
|
|
387
|
-
@staticmethod
|
|
388
|
-
def dechunck_http_response(file_data: bytes) -> bytes:
|
|
389
|
-
"""解码分块TCP数据"""
|
|
390
|
-
if not file_data:
|
|
391
|
-
return b""
|
|
392
|
-
|
|
393
|
-
chunks = []
|
|
394
|
-
cursor = 0
|
|
395
|
-
total_len = len(file_data)
|
|
396
|
-
|
|
397
|
-
while cursor < total_len:
|
|
398
|
-
# 1. 寻找当前 Chunk Size 行的结束符 (\n)
|
|
399
|
-
newline_idx = file_data.find(b"\n", cursor)
|
|
400
|
-
if newline_idx == -1:
|
|
401
|
-
# 找不到换行符,说明格式不对,抛出异常让外层处理
|
|
402
|
-
raise ValueError("Not chunked data")
|
|
403
|
-
|
|
404
|
-
# 2. 提取并解析十六进制大小
|
|
405
|
-
size_line = file_data[cursor:newline_idx].strip()
|
|
406
|
-
|
|
407
|
-
# 处理可能的空行 (例如上一个 Chunk 后的 CRLF)
|
|
408
|
-
if not size_line:
|
|
409
|
-
cursor = newline_idx + 1
|
|
410
|
-
continue
|
|
411
|
-
|
|
412
|
-
# 这里不要捕获 ValueError,如果解析失败,直接抛出
|
|
413
|
-
# 说明这根本不是 chunk size,而是普通数据
|
|
414
|
-
chunk_size = int(size_line, 16)
|
|
415
|
-
|
|
416
|
-
# Chunk Size 为 0 表示传输结束
|
|
417
|
-
if chunk_size == 0:
|
|
418
|
-
break
|
|
419
|
-
|
|
420
|
-
# 3. 定位数据区域
|
|
421
|
-
data_start = newline_idx + 1
|
|
422
|
-
data_end = data_start + chunk_size
|
|
423
|
-
|
|
424
|
-
if data_end > total_len:
|
|
425
|
-
# 数据被截断,尽力读取
|
|
426
|
-
chunks.append(file_data[data_start:])
|
|
427
|
-
break
|
|
428
|
-
|
|
429
|
-
# 4. 提取数据
|
|
430
|
-
chunks.append(file_data[data_start:data_end])
|
|
431
|
-
|
|
432
|
-
# 5. 移动游标
|
|
433
|
-
cursor = data_end
|
|
434
|
-
# 跳过尾随的 \r 和 \n
|
|
435
|
-
while cursor < total_len and file_data[cursor] in (13, 10):
|
|
436
|
-
cursor += 1
|
|
437
|
-
|
|
438
|
-
return b"".join(chunks)
|
|
439
|
-
|
|
440
|
-
@staticmethod
|
|
441
|
-
def extract_http_file_data(full_request: str) -> Tuple[bytes, bytes]:
|
|
442
|
-
"""提取HTTP请求或响应中的文件数据 (修复版)"""
|
|
443
|
-
# 1. 基础校验
|
|
444
|
-
if not full_request:
|
|
445
|
-
return b"", b""
|
|
446
|
-
|
|
447
|
-
try:
|
|
448
|
-
# 转为二进制
|
|
449
|
-
raw_bytes = bytes.fromhex(full_request)
|
|
450
|
-
|
|
451
|
-
# 分割 Header 和 Body
|
|
452
|
-
header, file_data = FlowAnalyzer.split_http_headers(raw_bytes)
|
|
453
|
-
|
|
454
|
-
# 处理 Chunked 编码
|
|
455
|
-
with contextlib.suppress(Exception):
|
|
456
|
-
file_data = FlowAnalyzer.dechunck_http_response(file_data)
|
|
457
|
-
|
|
458
|
-
# 处理 Gzip 压缩
|
|
459
|
-
with contextlib.suppress(Exception):
|
|
460
|
-
if file_data.startswith(b"\x1f\x8b"):
|
|
461
|
-
file_data = gzip.decompress(file_data)
|
|
462
|
-
|
|
463
|
-
return header, file_data
|
|
464
|
-
|
|
465
|
-
except ValueError as e:
|
|
466
|
-
# 专门捕获 Hex 转换错误,并打印出来,方便你调试
|
|
467
|
-
# 如果你在控制台看到这个错误,说明 Tshark 输出的数据格式非常奇怪
|
|
468
|
-
logger.error(f"Hex转换失败: {str(e)[:100]}... 原数据片段: {full_request[:50]}")
|
|
469
|
-
return b"", b""
|
|
470
|
-
except Exception as e:
|
|
471
|
-
logger.error(f"解析HTTP数据未知错误: {e}")
|
|
472
|
-
return b"", b""
|
|
473
|
-
|
|
474
298
|
@staticmethod
|
|
475
299
|
def get_tshark_path(tshark_path: Optional[str]) -> str:
|
|
476
300
|
default_tshark_path = get_default_tshark_path()
|
FlowAnalyzer/Models.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import NamedTuple, Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
|
|
6
|
+
class Request:
|
|
7
|
+
__slots__ = ("frame_num", "header", "file_data", "full_uri", "time_epoch")
|
|
8
|
+
frame_num: int
|
|
9
|
+
header: bytes
|
|
10
|
+
file_data: bytes
|
|
11
|
+
full_uri: str
|
|
12
|
+
time_epoch: float
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
|
|
16
|
+
class Response:
|
|
17
|
+
__slots__ = ("frame_num", "header", "file_data", "time_epoch", "_request_in")
|
|
18
|
+
frame_num: int
|
|
19
|
+
header: bytes
|
|
20
|
+
file_data: bytes
|
|
21
|
+
time_epoch: float
|
|
22
|
+
_request_in: Optional[int]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class HttpPair(NamedTuple):
|
|
26
|
+
request: Optional[Request]
|
|
27
|
+
response: Optional[Response]
|
FlowAnalyzer/PacketParser.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import binascii
|
|
2
|
+
import contextlib
|
|
3
|
+
import gzip
|
|
4
|
+
from typing import List, Optional, Tuple
|
|
5
|
+
from urllib import parse
|
|
6
|
+
|
|
7
|
+
from .logging_config import logger
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class PacketParser:
|
|
11
|
+
@staticmethod
|
|
12
|
+
def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes, bytes]:
|
|
13
|
+
"""
|
|
14
|
+
解析 Tshark 输出的一行数据
|
|
15
|
+
row definition (all bytes):
|
|
16
|
+
0: http.response.code
|
|
17
|
+
1: http.request_in
|
|
18
|
+
2: tcp.reassembled.data
|
|
19
|
+
3: frame.number
|
|
20
|
+
4: tcp.payload
|
|
21
|
+
5: frame.time_epoch
|
|
22
|
+
6: exported_pdu.exported_pdu
|
|
23
|
+
7: http.request.full_uri
|
|
24
|
+
8: http.file_data
|
|
25
|
+
9: tcp.segment.count
|
|
26
|
+
"""
|
|
27
|
+
frame_num = int(row[3])
|
|
28
|
+
request_in = int(row[1]) if row[1] else frame_num
|
|
29
|
+
# Decode only URI to string
|
|
30
|
+
full_uri = parse.unquote(row[7].decode("utf-8", errors="replace")) if row[7] else ""
|
|
31
|
+
time_epoch = float(row[5])
|
|
32
|
+
http_file_data = row[8] if len(row) > 8 else b""
|
|
33
|
+
|
|
34
|
+
# Logic for Raw Packet (Header Source)
|
|
35
|
+
is_reassembled = len(row) > 9 and row[9]
|
|
36
|
+
|
|
37
|
+
if is_reassembled and row[2]:
|
|
38
|
+
full_request = row[2]
|
|
39
|
+
elif row[4]:
|
|
40
|
+
full_request = row[4]
|
|
41
|
+
else:
|
|
42
|
+
# Fallback (e.g. Exported PDU)
|
|
43
|
+
full_request = row[2] if row[2] else (row[6] if row[6] else b"")
|
|
44
|
+
|
|
45
|
+
return frame_num, request_in, time_epoch, full_uri, full_request, http_file_data
|
|
46
|
+
|
|
47
|
+
@staticmethod
|
|
48
|
+
def split_http_headers(file_data: bytes) -> Tuple[bytes, bytes]:
|
|
49
|
+
headerEnd = file_data.find(b"\r\n\r\n")
|
|
50
|
+
if headerEnd != -1:
|
|
51
|
+
return file_data[: headerEnd + 4], file_data[headerEnd + 4 :]
|
|
52
|
+
elif file_data.find(b"\n\n") != -1:
|
|
53
|
+
headerEnd = file_data.index(b"\n\n") + 2
|
|
54
|
+
return file_data[:headerEnd], file_data[headerEnd:]
|
|
55
|
+
return b"", file_data
|
|
56
|
+
|
|
57
|
+
@staticmethod
|
|
58
|
+
def dechunk_http_response(file_data: bytes) -> bytes:
|
|
59
|
+
"""解码分块TCP数据"""
|
|
60
|
+
if not file_data:
|
|
61
|
+
return b""
|
|
62
|
+
|
|
63
|
+
chunks = []
|
|
64
|
+
cursor = 0
|
|
65
|
+
total_len = len(file_data)
|
|
66
|
+
|
|
67
|
+
while cursor < total_len:
|
|
68
|
+
newline_idx = file_data.find(b"\n", cursor)
|
|
69
|
+
if newline_idx == -1:
|
|
70
|
+
raise ValueError("Not chunked data")
|
|
71
|
+
|
|
72
|
+
size_line = file_data[cursor:newline_idx].strip()
|
|
73
|
+
if not size_line:
|
|
74
|
+
cursor = newline_idx + 1
|
|
75
|
+
continue
|
|
76
|
+
|
|
77
|
+
chunk_size = int(size_line, 16)
|
|
78
|
+
if chunk_size == 0:
|
|
79
|
+
break
|
|
80
|
+
|
|
81
|
+
data_start = newline_idx + 1
|
|
82
|
+
data_end = data_start + chunk_size
|
|
83
|
+
|
|
84
|
+
if data_end > total_len:
|
|
85
|
+
chunks.append(file_data[data_start:])
|
|
86
|
+
break
|
|
87
|
+
|
|
88
|
+
chunks.append(file_data[data_start:data_end])
|
|
89
|
+
|
|
90
|
+
cursor = data_end
|
|
91
|
+
while cursor < total_len and file_data[cursor] in (13, 10):
|
|
92
|
+
cursor += 1
|
|
93
|
+
|
|
94
|
+
return b"".join(chunks)
|
|
95
|
+
|
|
96
|
+
@staticmethod
|
|
97
|
+
def extract_http_file_data(full_request: bytes, http_file_data: bytes) -> Tuple[bytes, bytes]:
|
|
98
|
+
"""
|
|
99
|
+
提取HTTP请求或响应中的文件数据 (混合模式 - 二进制优化版)
|
|
100
|
+
"""
|
|
101
|
+
header = b""
|
|
102
|
+
file_data = b""
|
|
103
|
+
|
|
104
|
+
try:
|
|
105
|
+
# --- 1. 提取 Header ---
|
|
106
|
+
if full_request:
|
|
107
|
+
raw_bytes = binascii.unhexlify(full_request)
|
|
108
|
+
h_part, _ = PacketParser.split_http_headers(raw_bytes)
|
|
109
|
+
header = h_part
|
|
110
|
+
|
|
111
|
+
# --- 2. 提取 Body ---
|
|
112
|
+
if http_file_data:
|
|
113
|
+
try:
|
|
114
|
+
file_data = binascii.unhexlify(http_file_data)
|
|
115
|
+
return header, file_data
|
|
116
|
+
except binascii.Error:
|
|
117
|
+
logger.warning("解析 http.file_data Hex 失败,尝试回退到原始方式")
|
|
118
|
+
|
|
119
|
+
# --- 3. 回退模式 (Fallback) ---
|
|
120
|
+
if full_request and not file_data:
|
|
121
|
+
raw_bytes = binascii.unhexlify(full_request)
|
|
122
|
+
_, body_part = PacketParser.split_http_headers(raw_bytes)
|
|
123
|
+
|
|
124
|
+
with contextlib.suppress(Exception):
|
|
125
|
+
body_part = PacketParser.dechunk_http_response(body_part)
|
|
126
|
+
|
|
127
|
+
with contextlib.suppress(Exception):
|
|
128
|
+
if body_part.startswith(b"\x1f\x8b"):
|
|
129
|
+
body_part = gzip.decompress(body_part)
|
|
130
|
+
|
|
131
|
+
file_data = body_part
|
|
132
|
+
return header, file_data
|
|
133
|
+
|
|
134
|
+
except ValueError as e:
|
|
135
|
+
logger.error(f"Hex转换失败: {str(e)[:100]}...")
|
|
136
|
+
return b"", b""
|
|
137
|
+
except Exception as e:
|
|
138
|
+
logger.error(f"解析HTTP数据未知错误: {e}")
|
|
139
|
+
return b"", b""
|
|
140
|
+
|
|
141
|
+
@staticmethod
|
|
142
|
+
def process_row(line: bytes) -> Optional[dict]:
|
|
143
|
+
"""
|
|
144
|
+
处理单行数据,返回结构化结果供主线程写入
|
|
145
|
+
"""
|
|
146
|
+
line = line.rstrip(b"\r\n")
|
|
147
|
+
if not line:
|
|
148
|
+
return None
|
|
149
|
+
|
|
150
|
+
row = line.split(b"\t")
|
|
151
|
+
try:
|
|
152
|
+
frame_num, request_in, time_epoch, full_uri, full_request, http_file_data = PacketParser.parse_packet_data(row)
|
|
153
|
+
|
|
154
|
+
if not full_request and not http_file_data:
|
|
155
|
+
return None
|
|
156
|
+
|
|
157
|
+
header, file_data = PacketParser.extract_http_file_data(full_request, http_file_data)
|
|
158
|
+
|
|
159
|
+
# row[0] is http.response.code (bytes)
|
|
160
|
+
is_response = bool(row[0])
|
|
161
|
+
|
|
162
|
+
return {
|
|
163
|
+
"type": "response" if is_response else "request",
|
|
164
|
+
"frame_num": frame_num,
|
|
165
|
+
"header": header,
|
|
166
|
+
"file_data": file_data,
|
|
167
|
+
"time_epoch": time_epoch,
|
|
168
|
+
"request_in": request_in, # Only useful for Response
|
|
169
|
+
"full_uri": full_uri, # Only useful for Request
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
except Exception:
|
|
173
|
+
return None
|
|
174
|
+
|
|
175
|
+
@staticmethod
|
|
176
|
+
def process_batch(lines: List[bytes]) -> List[dict]:
|
|
177
|
+
"""
|
|
178
|
+
批量处理行数据,减少函数调用开销
|
|
179
|
+
"""
|
|
180
|
+
results = []
|
|
181
|
+
for line in lines:
|
|
182
|
+
res = PacketParser.process_row(line)
|
|
183
|
+
if res:
|
|
184
|
+
results.append(res)
|
|
185
|
+
return results
|
FlowAnalyzer/logging_config.py
CHANGED
|
@@ -15,8 +15,9 @@ def configure_logger(logger_name, level=logging.DEBUG) -> logging.Logger:
|
|
|
15
15
|
console_handler.setFormatter(formatter)
|
|
16
16
|
return logger
|
|
17
17
|
|
|
18
|
+
|
|
18
19
|
logger = configure_logger("FlowAnalyzer", logging.INFO)
|
|
19
20
|
|
|
20
|
-
if __name__ == '__main__':
|
|
21
|
+
if __name__ == "__main__":
|
|
21
22
|
logger = configure_logger("FlowAnalyzer")
|
|
22
23
|
logger.info("This is a test!")
|
{flowanalyzer-0.4.3.dist-info → flowanalyzer-0.4.4.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: FlowAnalyzer
|
|
3
|
-
Version: 0.4.3
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: FlowAnalyzer是一个流量分析器,用于解析和处理tshark导出的JSON数据文件
|
|
5
5
|
Home-page: https://github.com/Byxs20/FlowAnalyzer
|
|
6
6
|
Author: Byxs20
|
|
@@ -15,6 +15,14 @@ Classifier: Programming Language :: Python :: 3.8
|
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.9
|
|
16
16
|
Description-Content-Type: text/markdown
|
|
17
17
|
License-File: LICENSE
|
|
18
|
+
Dynamic: author
|
|
19
|
+
Dynamic: author-email
|
|
20
|
+
Dynamic: classifier
|
|
21
|
+
Dynamic: description
|
|
22
|
+
Dynamic: description-content-type
|
|
23
|
+
Dynamic: home-page
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
Dynamic: summary
|
|
18
26
|
|
|
19
27
|
# FlowAnalyzer
|
|
20
28
|
|
|
@@ -28,9 +36,11 @@ License-File: LICENSE
|
|
|
28
36
|
|
|
29
37
|
为了解决传统解析方式慢、内存占用高的问题,FlowAnalyzer 进行了核心架构升级:**流式解析 + SQLite 智能缓存**。
|
|
30
38
|
|
|
31
|
-
### 1. ⚡️ 高性能流式解析
|
|
32
|
-
-
|
|
33
|
-
-
|
|
39
|
+
### 1. ⚡️ 高性能流式解析 (多线程流水线)
|
|
40
|
+
- **多线程并行**:采用 `ThreadPoolExecutor` 构建流水线,主线程负责读取 Tshark 输出,子线程并行解析数据包,充分利用多核 CPU。
|
|
41
|
+
- **批量处理**:引入 Batch 机制(默认 2000 包/批),大幅减少数据库事务开销和 Python 函数调用损耗。
|
|
42
|
+
- **内存背压控制 (Backpressure)**:智能监控待处理队列长度,防止在处理高速流量时内存溢出。
|
|
43
|
+
- **极低内存占用**:不再将整个 JSON 读入内存。通过 `subprocess` 管道流式处理,解析过程中不生成体积巨大的临时文件。
|
|
34
44
|
|
|
35
45
|
### 2. 💾 智能缓存机制
|
|
36
46
|
- **自动缓存**:首次分析 `test.pcap` 时,会自动生成同级目录下的 `test.db`。
|
|
@@ -51,7 +61,7 @@ License-File: LICENSE
|
|
|
51
61
|
|
|
52
62
|
| 特性 | 旧版架构 | **新版架构 (FlowAnalyzer)** |
|
|
53
63
|
| :----------- | :---------------------------- | :---------------------------------- |
|
|
54
|
-
| **解析流程** | 生成巨大 JSON -> 全量读入内存 | Tshark流 ->
|
|
64
|
+
| **解析流程** | 生成巨大 JSON -> 全量读入内存 | Tshark流 -> 多线程Batch解析 -> SQLite |
|
|
55
65
|
| **内存占用** | 极高 (易 OOM) | **极低 (内存稳定)** |
|
|
56
66
|
| **二次加载** | 需重新解析 | **直接读取 DB (0秒)** |
|
|
57
67
|
| **磁盘占用** | 巨大的临时 JSON 文件 | 轻量级 SQLite 文件 |
|
|
@@ -63,11 +73,11 @@ License-File: LICENSE
|
|
|
63
73
|
请确保您的环境中已安装 Python 3 和 Tshark (Wireshark)。
|
|
64
74
|
|
|
65
75
|
```bash
|
|
66
|
-
# 安装 FlowAnalyzer
|
|
67
|
-
pip3 install FlowAnalyzer
|
|
76
|
+
# 安装 FlowAnalyzer
|
|
77
|
+
pip3 install FlowAnalyzer
|
|
68
78
|
|
|
69
79
|
# 或者使用国内源加速
|
|
70
|
-
pip3 install FlowAnalyzer
|
|
80
|
+
pip3 install FlowAnalyzer -i https://pypi.org/simple
|
|
71
81
|
```
|
|
72
82
|
|
|
73
83
|
---
|
flowanalyzer-0.4.4.dist-info/RECORD
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
FlowAnalyzer/FlowAnalyzer.py,sha256=GPXZeM1uiLmv_-UKtIwYlfYJ450Etpbtt4V2i_MpLhQ,12721
|
|
2
|
+
FlowAnalyzer/Models.py,sha256=2x7nPJIAyLTC1oiGFlW4mELDPgthk2IsmuyearT-MSQ,622
|
|
3
|
+
FlowAnalyzer/PacketParser.py,sha256=fGql84e-tu1PDsXh3NxctKaSh5YeYsJbh5ZCUe6Mo40,6329
|
|
4
|
+
FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
|
|
5
|
+
FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
|
|
6
|
+
FlowAnalyzer/logging_config.py,sha256=fnBlvoimteQ38IBlQBV9fdLQvfAlRgGhcvLpUC3YunA,732
|
|
7
|
+
flowanalyzer-0.4.4.dist-info/licenses/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
|
|
8
|
+
flowanalyzer-0.4.4.dist-info/METADATA,sha256=aUWcp8_ocQIgz0k_3IlhEHrkzsLHYo7XBPqEayIOGc0,6099
|
|
9
|
+
flowanalyzer-0.4.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
10
|
+
flowanalyzer-0.4.4.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
|
|
11
|
+
flowanalyzer-0.4.4.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
FlowAnalyzer/FlowAnalyzer.py,sha256=9seSOamepCnejHYRKLWym9Eu0lbxCgn7p3hE2WUZstk,18964
|
|
2
|
-
FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
|
|
3
|
-
FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
|
|
4
|
-
FlowAnalyzer/logging_config.py,sha256=-RntNJhrBiW7ToXIP1WJjZ4Yf9jmZQ1PTX_er3tDxhw,730
|
|
5
|
-
flowanalyzer-0.4.3.dist-info/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
|
|
6
|
-
flowanalyzer-0.4.3.dist-info/METADATA,sha256=W6BhXCna1TYeTVd_gY5Q63xjbckhRpomHYErrtS5fBM,5588
|
|
7
|
-
flowanalyzer-0.4.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
|
|
8
|
-
flowanalyzer-0.4.3.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
|
|
9
|
-
flowanalyzer-0.4.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|