FlowAnalyzer 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- FlowAnalyzer/FlowAnalyzer.py +121 -83
- {flowanalyzer-0.4.1.dist-info → flowanalyzer-0.4.3.dist-info}/METADATA +2 -10
- flowanalyzer-0.4.3.dist-info/RECORD +9 -0
- {flowanalyzer-0.4.1.dist-info → flowanalyzer-0.4.3.dist-info}/WHEEL +1 -1
- flowanalyzer-0.4.1.dist-info/RECORD +0 -9
- {flowanalyzer-0.4.1.dist-info/licenses → flowanalyzer-0.4.3.dist-info}/LICENSE +0 -0
- {flowanalyzer-0.4.1.dist-info → flowanalyzer-0.4.3.dist-info}/top_level.txt +0 -0
FlowAnalyzer/FlowAnalyzer.py
CHANGED
@@ -1,14 +1,13 @@
 import contextlib
+import csv
 import gzip
 import os
 import sqlite3
 import subprocess
 from dataclasses import dataclass
-from typing import
+from typing import Iterable, NamedTuple, Optional, Tuple
 from urllib import parse

-import ijson
-
 from .logging_config import logger
 from .Path import get_default_tshark_path

@@ -69,57 +68,54 @@ class FlowAnalyzer:
         if not os.path.exists(self.db_path):
             raise FileNotFoundError(f"Data file or cache database not found: {self.db_path}; call get_json_data first to generate it.")

-    def
-        """
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
+    def generate_http_dict_pairs(self) -> Iterable[HttpPair]:
+        """Generate paired HTTP request/response info (high-performance SQL JOIN version)"""
+        if not os.path.exists(self.db_path):
+            return

+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            # Enable query optimization
+            cursor.execute("PRAGMA query_only = 1;")
+
+            # === Step 1: pairing query ===
+            # Use SQLite's LEFT JOIN to match requests and responses directly,
+            # instead of loading all the data into Python memory
+            sql_pair = """
+                SELECT
+                    req.frame_num, req.header, req.file_data, req.full_uri, req.time_epoch,        -- 0-4 (Request)
+                    resp.frame_num, resp.header, resp.file_data, resp.time_epoch, resp.request_in  -- 5-9 (Response)
+                FROM requests req
+                LEFT JOIN responses resp ON req.frame_num = resp.request_in
+                ORDER BY req.frame_num ASC
+            """
+
+            cursor.execute(sql_pair)
+
+            # Stream over the result set; memory usage stays minimal
+            for row in cursor:
+                # Build the Request object.
+                # Handle NULLs: the requests table should not contain them, but use `or b""` just in case
+                req = Request(frame_num=row[0], header=row[1] or b"", file_data=row[2] or b"", full_uri=row[3] or "", time_epoch=row[4])
+
+                resp = None
+                # If row[5] (the response frame_num) is not NULL, a matching response was found
+                if row[5] is not None:
+                    resp = Response(frame_num=row[5], header=row[6] or b"", file_data=row[7] or b"", time_epoch=row[8], _request_in=row[9])

-    def generate_http_dict_pairs(self) -> Iterable[HttpPair]:
-        """Generate paired HTTP request/response info"""
-        requests, responses = self._load_from_db()
-        response_map = {r._request_in: r for r in responses.values()}
-        yielded_resps = set()
-
-        for req_id, req in requests.items():
-            resp = response_map.get(req_id)
-            if resp:
-                yielded_resps.add(resp.frame_num)
                 yield HttpPair(request=req, response=resp)
-            else:
-                yield HttpPair(request=req, response=None)

+            # === Step 2: orphan-response query ===
+            # Find response packets whose request_in points at no existing request
+            sql_orphan = """
+                SELECT frame_num, header, file_data, time_epoch, request_in
+                FROM responses
+                WHERE request_in NOT IN (SELECT frame_num FROM requests)
+            """
+            cursor.execute(sql_orphan)
+
+            for row in cursor:
+                resp = Response(frame_num=row[0], header=row[1] or b"", file_data=row[2] or b"", time_epoch=row[3], _request_in=row[4])
                 yield HttpPair(request=None, response=resp)

     # =========================================================================
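
To make the new pairing strategy easier to follow, here is a small self-contained sketch (not part of the package) that rebuilds the two tables in an in-memory SQLite database and runs the same pairing and orphan queries. Only the column order is taken from the INSERT statements later in this diff; the column types and sample values are assumptions for illustration.

# Standalone sketch of the pairing queries above, using an in-memory DB.
# Table layouts follow the INSERT statements later in this diff:
#   requests(frame_num, header, file_data, full_uri, time_epoch)
#   responses(frame_num, header, file_data, time_epoch, request_in)
import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE requests (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, full_uri TEXT, time_epoch REAL)")
cur.execute("CREATE TABLE responses (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, time_epoch REAL, request_in INTEGER)")

# Request 1 has a response, request 3 does not, and response 9 is an orphan.
cur.executemany("INSERT INTO requests VALUES (?,?,?,?,?)", [
    (1, b"GET / HTTP/1.1", b"", "http://example.test/", 1.0),
    (3, b"GET /a HTTP/1.1", b"", "http://example.test/a", 2.0),
])
cur.executemany("INSERT INTO responses VALUES (?,?,?,?,?)", [
    (2, b"HTTP/1.1 200 OK", b"hello", 1.1, 1),
    (9, b"HTTP/1.1 404 Not Found", b"", 5.0, 7),  # no request with frame_num 7
])

cur.execute("""
    SELECT req.frame_num, resp.frame_num
    FROM requests req
    LEFT JOIN responses resp ON req.frame_num = resp.request_in
    ORDER BY req.frame_num ASC
""")
print(cur.fetchall())  # [(1, 2), (3, None)]

cur.execute("SELECT frame_num FROM responses WHERE request_in NOT IN (SELECT frame_num FROM requests)")
print(cur.fetchall())  # [(9,)]

The LEFT JOIN keeps unanswered requests in the result as (request, NULL) rows, which is what lets the generator yield HttpPair(request=req, response=None) without a second pass over the data.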
@@ -208,6 +204,14 @@ class FlowAnalyzer:
     @staticmethod
     def _stream_tshark_to_db(pcap_path: str, display_filter: str, tshark_path: str, db_path: str):
         """Stream-parse tshark output into the DB while recording metadata"""
+        # Raise the CSV field size limit so oversized packets do not raise errors.
+        # Set it generously; on 32-bit systems do not exceed 2 GB
+        # (Python ints are unbounded, but stay on the safe side).
+        # On Windows, sys.maxsize is usually large enough.
+        try:
+            csv.field_size_limit(500 * 1024 * 1024)  # 500 MB
+        except Exception:
+            # If that fails, fall back to the 32-bit maximum
+            csv.field_size_limit(int(2**31 - 1))

         if os.path.exists(db_path):
             os.remove(db_path)
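
A quick illustration of why the limit is raised: Python's csv module rejects any single field longer than the current limit (131072 characters by default), and a reassembled TCP payload arrives from tshark as one long hex string in a single field. A minimal standalone sketch, independent of the package:

# Demonstrates csv.field_size_limit: a very long field fails under the
# default limit and parses fine once the limit is raised.
import csv
import io

long_field = "a" * 200_000  # longer than the default 128 KiB limit

try:
    list(csv.reader(io.StringIO(long_field + "|x"), delimiter="|"))
except csv.Error as exc:
    print("rejected:", exc)

old_limit = csv.field_size_limit(500 * 1024 * 1024)  # returns the previous limit
rows = list(csv.reader(io.StringIO(long_field + "|x"), delimiter="|"))
print("old limit was", old_limit, "- parsed", len(rows[0]), "fields")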
@@ -231,6 +235,7 @@ class FlowAnalyzer:
         """)
         conn.commit()

+        # Switch the command to -T fields mode
         command = [
             tshark_path,
             "-r",
@@ -238,55 +243,75 @@ class FlowAnalyzer:
             "-Y",
             f"({display_filter})",
             "-T",
-            "
+            "fields",
+            # Output fields to extract
             "-e",
-            "http.response.code",
+            "http.response.code",  # 0
             "-e",
-            "http.request_in",
+            "http.request_in",  # 1
             "-e",
-            "tcp.reassembled.data",
+            "tcp.reassembled.data",  # 2
             "-e",
-            "frame.number",
+            "frame.number",  # 3
             "-e",
-            "tcp.payload",
+            "tcp.payload",  # 4
             "-e",
-            "frame.time_epoch",
+            "frame.time_epoch",  # 5
             "-e",
-            "exported_pdu.exported_pdu",
+            "exported_pdu.exported_pdu",  # 6
             "-e",
-            "http.request.full_uri",
+            "http.request.full_uri",  # 7
+            # Output format control
+            "-E",
+            "header=n",  # no header row
+            "-E",
+            "separator=|",  # use | as the separator (safer than a comma)
+            "-E",
+            "quote=d",  # wrap values in double quotes
+            "-E",
+            "occurrence=f",  # take only the first occurrence of each field
         ]

         logger.debug(f"Running tshark: {command}")

+        # Read stdout in text mode with UTF-8 decoding
+        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(pcap_path)), encoding="utf-8", errors="replace")

         db_req_rows = []
         db_resp_rows = []
         BATCH_SIZE = 5000

         try:
+            # Parse the stdout stream with csv.reader
+            reader = csv.reader(process.stdout, delimiter="|", quotechar='"')  # type: ignore
             with sqlite3.connect(db_path) as conn:
                 cursor = conn.cursor()

-                for
+                for row in reader:
+                    # row is a list matching the -e order above:
+                    # [code, req_in, reassembled, frame, payload, epoch, pdu, uri]
+                    if not row:
                         continue

                     try:
+                        # Parse the row
+                        frame_num, request_in, time_epoch, full_uri, full_request = FlowAnalyzer.parse_packet_data(row)
+
                         if not full_request:
                             continue
+
                         header, file_data = FlowAnalyzer.extract_http_file_data(full_request)

+                        # Decide whether this is a request or a response:
+                        # http.response.code (index 0) is non-empty only for responses
+                        if row[0]:
+                            # Response
                             db_resp_rows.append((frame_num, header, file_data, time_epoch, request_in))
                         else:
+                            # Request
                             db_req_rows.append((frame_num, header, file_data, full_uri, time_epoch))

+                        # Batch insert
                         if len(db_req_rows) >= BATCH_SIZE:
                             cursor.executemany("INSERT OR REPLACE INTO requests VALUES (?,?,?,?,?)", db_req_rows)
                             db_req_rows.clear()
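
With the options above, tshark emits one |-separated, double-quoted record per packet, which csv.reader can consume directly from the process's stdout. A standalone sketch with a fabricated line (the field values are invented; only the field order follows the -e flags above):

# Parses a fabricated line shaped like `tshark -T fields -E separator=| -E quote=d` output.
# Field order matches the -e flags above:
# [response.code, request_in, reassembled.data, frame.number, payload, time_epoch, exported_pdu, full_uri]
import csv
import io

fake_stdout = io.StringIO(
    '"200"|"4"|"485454502f312e3120323030"|"7"|""|"1700000000.123"|""|""\n'
)

for row in csv.reader(fake_stdout, delimiter="|", quotechar='"'):
    code, request_in, reassembled, frame, payload, epoch, pdu, uri = row
    kind = "response" if code else "request"
    print(kind, "frame", int(frame), "answers request", int(request_in) if request_in else None)
    # -> response frame 7 answers request 4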
@@ -294,14 +319,19 @@ class FlowAnalyzer:
                             cursor.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", db_resp_rows)
                             db_resp_rows.clear()

-                    except Exception:
+                    except Exception as e:
+                        # The occasional row may fail to parse; just skip it
                         pass

+                # Insert the remaining rows
                 if db_req_rows:
                     cursor.executemany("INSERT OR REPLACE INTO requests VALUES (?,?,?,?,?)", db_req_rows)
                 if db_resp_rows:
                     cursor.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", db_resp_rows)

+                # --- Optimization: create the index after the data is inserted; it is faster that way ---
+                cursor.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
+
                 pcap_mtime = os.path.getmtime(pcap_path)
                 pcap_size = os.path.getsize(pcap_path)
                 cursor.execute("INSERT INTO meta_info (filter, pcap_path, pcap_mtime, pcap_size) VALUES (?, ?, ?, ?)", (display_filter, pcap_path, pcap_mtime, pcap_size))
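
Deferring CREATE INDEX is a common bulk-load pattern: rows go into an unindexed table so SQLite does not maintain the B-tree on every insert, and the index is built once at the end. A minimal sketch of the pattern using the same batch size and index name as above (row contents are invented):

# Bulk-load pattern sketched from the code above: batch executemany inserts,
# flush the remainder, then build the lookup index in one pass at the end.
import sqlite3

BATCH_SIZE = 5000
conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE responses (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, time_epoch REAL, request_in INTEGER)")

pending = []
for i in range(12_345):  # stand-in for rows streamed out of tshark
    pending.append((i, b"", b"", float(i), i - 1))
    if len(pending) >= BATCH_SIZE:
        cur.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", pending)
        pending.clear()

if pending:  # flush whatever is left after the loop
    cur.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", pending)

# Index built after the data is loaded, so inserts never pay for index maintenance.
cur.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
conn.commit()
print(cur.execute("SELECT COUNT(*) FROM responses").fetchone())  # (12345,)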
@@ -319,18 +349,29 @@ class FlowAnalyzer:
     # --- Helper static methods ---

     @staticmethod
-    def parse_packet_data(
+    def parse_packet_data(row: list) -> Tuple[int, int, float, str, str]:
+        # row definition:
+        # 0: http.response.code
+        # 1: http.request_in
+        # 2: tcp.reassembled.data
+        # 3: frame.number
+        # 4: tcp.payload
+        # 5: frame.time_epoch
+        # 6: exported_pdu.exported_pdu
+        # 7: http.request.full_uri
+
+        frame_num = int(row[3])
+        request_in = int(row[1]) if row[1] else frame_num
+        full_uri = parse.unquote(row[7]) if row[7] else ""
+        time_epoch = float(row[5])
+
+        if row[2]:
+            full_request = row[2]
+        elif row[4]:
+            full_request = row[4]
         else:
-            full_request =
+            full_request = row[6] if row[6] else ""
+
         return frame_num, request_in, time_epoch, full_uri, full_request

     @staticmethod
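
To illustrate the expected row layout, here is a hypothetical call with invented field values; it assumes the FlowAnalyzer class is importable from FlowAnalyzer/FlowAnalyzer.py as this diff suggests:

# Hypothetical usage of the static helper above; the row mirrors the
# documented field order, with invented values for illustration.
from FlowAnalyzer.FlowAnalyzer import FlowAnalyzer  # import path is an assumption

row = [
    "",                                       # 0: http.response.code (empty -> this is a request)
    "",                                       # 1: http.request_in
    "",                                       # 2: tcp.reassembled.data
    "42",                                     # 3: frame.number
    "474554202f20485454502f312e310d0a0d0a",   # 4: tcp.payload (hex)
    "1700000000.5",                           # 5: frame.time_epoch
    "",                                       # 6: exported_pdu.exported_pdu
    "http%3A%2F%2Fexample.test%2F",           # 7: http.request.full_uri (URL-encoded)
]

frame_num, request_in, time_epoch, full_uri, full_request = FlowAnalyzer.parse_packet_data(row)
print(frame_num, request_in, time_epoch, full_uri)
# -> 42 42 1700000000.5 http://example.test/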
@@ -345,10 +386,7 @@ class FlowAnalyzer:

     @staticmethod
     def dechunck_http_response(file_data: bytes) -> bytes:
-        """Decode chunked TCP data
-        Note: if the data is not in chunked format, this function must raise an exception,
-        so that the outer logic can fall back to the raw data.
-        """
+        """Decode chunked TCP data"""
         if not file_data:
             return b""

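For context on what dechunck_http_response handles: a chunked HTTP body is a series of hex-length-prefixed chunks terminated by a zero-length chunk. The sketch below is an illustrative decoder for well-formed input, not the package's implementation:

# Minimal sketch of decoding a chunked HTTP body; assumes well-formed input
# and is independent of the package's dechunck_http_response implementation.
def dechunk(body: bytes) -> bytes:
    out, pos = b"", 0
    while True:
        line_end = body.index(b"\r\n", pos)
        size = int(body[pos:line_end], 16)      # hex chunk length
        if size == 0:                           # terminating zero-length chunk
            return out
        out += body[line_end + 2:line_end + 2 + size]
        pos = line_end + 2 + size + 2           # skip chunk data and trailing CRLF

print(dechunk(b"5\r\nhello\r\n6\r\n world\r\n0\r\n\r\n"))  # b'hello world'
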
{flowanalyzer-0.4.1.dist-info → flowanalyzer-0.4.3.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: FlowAnalyzer
-Version: 0.4.1
+Version: 0.4.3
 Summary: FlowAnalyzer is a traffic analyzer for parsing and processing JSON data files exported by tshark
 Home-page: https://github.com/Byxs20/FlowAnalyzer
 Author: Byxs20

@@ -15,14 +15,6 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license-file
-Dynamic: summary

 # FlowAnalyzer

flowanalyzer-0.4.3.dist-info/RECORD
ADDED

@@ -0,0 +1,9 @@
+FlowAnalyzer/FlowAnalyzer.py,sha256=9seSOamepCnejHYRKLWym9Eu0lbxCgn7p3hE2WUZstk,18964
+FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
+FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
+FlowAnalyzer/logging_config.py,sha256=-RntNJhrBiW7ToXIP1WJjZ4Yf9jmZQ1PTX_er3tDxhw,730
+flowanalyzer-0.4.3.dist-info/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
+flowanalyzer-0.4.3.dist-info/METADATA,sha256=W6BhXCna1TYeTVd_gY5Q63xjbckhRpomHYErrtS5fBM,5588
+flowanalyzer-0.4.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+flowanalyzer-0.4.3.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
+flowanalyzer-0.4.3.dist-info/RECORD,,

flowanalyzer-0.4.1.dist-info/RECORD
REMOVED

@@ -1,9 +0,0 @@
-FlowAnalyzer/FlowAnalyzer.py,sha256=ciuWFPQWQgYqjdL_u7ck4BNIsQNx00HLOjr6lSkfzMg,17348
-FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
-FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
-FlowAnalyzer/logging_config.py,sha256=-RntNJhrBiW7ToXIP1WJjZ4Yf9jmZQ1PTX_er3tDxhw,730
-flowanalyzer-0.4.1.dist-info/licenses/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
-flowanalyzer-0.4.1.dist-info/METADATA,sha256=WD01CpYRDVbT8RA5GwTKYZPv8Fa06_-4ZuiTAa5SfeE,5767
-flowanalyzer-0.4.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-flowanalyzer-0.4.1.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
-flowanalyzer-0.4.1.dist-info/RECORD,,

{flowanalyzer-0.4.1.dist-info/licenses → flowanalyzer-0.4.3.dist-info}/LICENSE
File without changes

{flowanalyzer-0.4.1.dist-info → flowanalyzer-0.4.3.dist-info}/top_level.txt
File without changes