FlowAnalyzer-0.4.1-py3-none-any.whl → FlowAnalyzer-0.4.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
FlowAnalyzer/FlowAnalyzer.py

@@ -1,14 +1,13 @@
  import contextlib
+ import csv
  import gzip
  import os
  import sqlite3
  import subprocess
  from dataclasses import dataclass
- from typing import Dict, Iterable, NamedTuple, Optional, Tuple
+ from typing import Iterable, NamedTuple, Optional, Tuple
  from urllib import parse

- import ijson
-
  from .logging_config import logger
  from .Path import get_default_tshark_path

@@ -69,57 +68,54 @@ class FlowAnalyzer:
          if not os.path.exists(self.db_path):
              raise FileNotFoundError(f"Data file or cache database not found: {self.db_path}; call get_json_data first to generate it.")

-     def _load_from_db(self) -> Tuple[Dict[int, Request], Dict[int, Response]]:
-         """Load data from the SQLite database."""
-         requests, responses = {}, {}
-         try:
-             with sqlite3.connect(self.db_path) as conn:
-                 cursor = conn.cursor()
+     def generate_http_dict_pairs(self) -> Iterable[HttpPair]:
+         """Yield paired HTTP request/response records (high-performance SQL JOIN version)."""
+         if not os.path.exists(self.db_path):
+             return

-                 # Basic sanity check
-                 try:
-                     cursor.execute("SELECT count(*) FROM requests")
-                     if cursor.fetchone()[0] == 0:
-                         cursor.execute("SELECT count(*) FROM responses")
-                         if cursor.fetchone()[0] == 0:
-                             return {}, {}
-                 except sqlite3.OperationalError:
-                     logger.error("Database is corrupt or tables are missing")
-                     return {}, {}
-
-                 logger.debug(f"Loading cached data: {self.db_path}")
-
-                 # Load requests
-                 cursor.execute("SELECT frame_num, header, file_data, full_uri, time_epoch FROM requests")
-                 for row in cursor.fetchall():
-                     requests[row[0]] = Request(row[0], row[1], row[2], row[3], row[4])
-
-                 # Load responses
-                 cursor.execute("SELECT frame_num, header, file_data, time_epoch, request_in FROM responses")
-                 for row in cursor.fetchall():
-                     responses[row[0]] = Response(row[0], row[1], row[2], row[3], row[4])
-
-             return requests, responses
-         except sqlite3.Error as e:
-             logger.error(f"Error reading database: {e}")
-             return {}, {}
+         with sqlite3.connect(self.db_path) as conn:
+             cursor = conn.cursor()
+             # Put the connection in read-only query mode
+             cursor.execute("PRAGMA query_only = 1;")
+
+             # === Step 1: pairing query ===
+             # Use SQLite's LEFT JOIN to match requests with responses directly,
+             # instead of loading everything into Python memory
+             sql_pair = """
+                 SELECT
+                     req.frame_num, req.header, req.file_data, req.full_uri, req.time_epoch,        -- 0-4 (Request)
+                     resp.frame_num, resp.header, resp.file_data, resp.time_epoch, resp.request_in  -- 5-9 (Response)
+                 FROM requests req
+                 LEFT JOIN responses resp ON req.frame_num = resp.request_in
+                 ORDER BY req.frame_num ASC
+             """
+
+             cursor.execute(sql_pair)
+
+             # Stream over the result set; memory usage stays minimal
+             for row in cursor:
+                 # Build the Request object
+                 # Guard against NULLs with `or b""`: the requests table shouldn't contain any, but just in case
+                 req = Request(frame_num=row[0], header=row[1] or b"", file_data=row[2] or b"", full_uri=row[3] or "", time_epoch=row[4])
+
+                 resp = None
+                 # A non-NULL row[5] (the response frame_num) means a matching response was found
+                 if row[5] is not None:
+                     resp = Response(frame_num=row[5], header=row[6] or b"", file_data=row[7] or b"", time_epoch=row[8], _request_in=row[9])

-     def generate_http_dict_pairs(self) -> Iterable[HttpPair]:
-         """Yield paired HTTP request/response records."""
-         requests, responses = self._load_from_db()
-         response_map = {r._request_in: r for r in responses.values()}
-         yielded_resps = set()
-
-         for req_id, req in requests.items():
-             resp = response_map.get(req_id)
-             if resp:
-                 yielded_resps.add(resp.frame_num)
                  yield HttpPair(request=req, response=resp)
-             else:
-                 yield HttpPair(request=req, response=None)

-         for resp in responses.values():
-             if resp.frame_num not in yielded_resps:
+             # === Step 2: orphan-response query ===
+             # Find response packets that carry a request_in but have no matching Request
+             sql_orphan = """
+                 SELECT frame_num, header, file_data, time_epoch, request_in
+                 FROM responses
+                 WHERE request_in NOT IN (SELECT frame_num FROM requests)
+             """
+             cursor.execute(sql_orphan)
+
+             for row in cursor:
+                 resp = Response(frame_num=row[0], header=row[1] or b"", file_data=row[2] or b"", time_epoch=row[3], _request_in=row[4])
                  yield HttpPair(request=None, response=resp)

      # =========================================================================
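
The pairing strategy in the new generate_http_dict_pairs can be tried in isolation. Below is a minimal, self-contained sketch against an in-memory SQLite database; the schema and the two queries mirror the diff above, while the sample rows are invented:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    cur = conn.cursor()
    cur.execute("CREATE TABLE requests (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, full_uri TEXT, time_epoch REAL)")
    cur.execute("CREATE TABLE responses (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, time_epoch REAL, request_in INTEGER)")
    cur.executemany("INSERT INTO requests VALUES (?,?,?,?,?)", [
        (1, b"GET / HTTP/1.1", b"", "http://example.com/", 1.0),    # gets a response
        (3, b"GET /a HTTP/1.1", b"", "http://example.com/a", 3.0),  # never answered
    ])
    cur.executemany("INSERT INTO responses VALUES (?,?,?,?,?)", [
        (2, b"HTTP/1.1 200 OK", b"hello", 2.0, 1),   # pairs with request 1
        (9, b"HTTP/1.1 200 OK", b"orphan", 9.0, 8),  # request 8 was never captured
    ])

    # Step 1: every request, joined to its response when one exists
    pair_sql = ("SELECT req.frame_num, resp.frame_num FROM requests req "
                "LEFT JOIN responses resp ON req.frame_num = resp.request_in "
                "ORDER BY req.frame_num")
    print(cur.execute(pair_sql).fetchall())    # [(1, 2), (3, None)]

    # Step 2: responses whose request was never seen
    orphan_sql = "SELECT frame_num FROM responses WHERE request_in NOT IN (SELECT frame_num FROM requests)"
    print(cur.execute(orphan_sql).fetchall())  # [(9,)]

The LEFT JOIN keeps every request even when nothing matched (the resp columns come back NULL), which is exactly the request-with-response=None case yielded above.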
@@ -208,6 +204,14 @@ class FlowAnalyzer:
      @staticmethod
      def _stream_tshark_to_db(pcap_path: str, display_filter: str, tshark_path: str, db_path: str):
          """Stream-parse tshark output into the DB while recording metadata"""
+         # Raise the CSV field size limit so oversized packets don't fail to parse.
+         # Keep the value below 2 GB: it can overflow on 32-bit builds beyond that,
+         # and 500 MB is already generous (on Windows sys.maxsize is usually large enough).
+         try:
+             csv.field_size_limit(500 * 1024 * 1024)  # 500 MB
+         except Exception:
+             # If that fails, fall back to the largest 32-bit value
+             csv.field_size_limit(int(2**31 - 1))

          if os.path.exists(db_path):
              os.remove(db_path)
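
For context, csv.field_size_limit is a module-wide setting in the standard library (CPython's default is 131072 bytes), and calling it with an argument returns the previous limit. A quick illustrative check:

    import csv

    print(csv.field_size_limit())                  # current limit; defaults to 131072 (128 KB)
    old = csv.field_size_limit(500 * 1024 * 1024)  # set a new limit; the previous one is returned
    print(old, csv.field_size_limit())             # 131072 524288000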
@@ -231,6 +235,7 @@ class FlowAnalyzer:
          """)
          conn.commit()

+         # The command now uses -T fields mode
          command = [
              tshark_path,
              "-r",
@@ -238,55 +243,75 @@ class FlowAnalyzer:
              "-Y",
              f"({display_filter})",
              "-T",
-             "json",
+             "fields",
+             # Fields to output
              "-e",
-             "http.response.code",
+             "http.response.code",  # 0
              "-e",
-             "http.request_in",
+             "http.request_in",  # 1
              "-e",
-             "tcp.reassembled.data",
+             "tcp.reassembled.data",  # 2
              "-e",
-             "frame.number",
+             "frame.number",  # 3
              "-e",
-             "tcp.payload",
+             "tcp.payload",  # 4
              "-e",
-             "frame.time_epoch",
+             "frame.time_epoch",  # 5
              "-e",
-             "exported_pdu.exported_pdu",
+             "exported_pdu.exported_pdu",  # 6
              "-e",
-             "http.request.full_uri",
+             "http.request.full_uri",  # 7
+             # Output formatting
+             "-E",
+             "header=n",  # no header row
+             "-E",
+             "separator=|",  # split on | (safer than a comma)
+             "-E",
+             "quote=d",  # wrap fields in double quotes
+             "-E",
+             "occurrence=f",  # take only the first value of each field
          ]

          logger.debug(f"Running tshark: {command}")

-         process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(pcap_path)))
+         # Read stdout in text mode, decoded as UTF-8
+         process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(pcap_path)), encoding="utf-8", errors="replace")

          db_req_rows = []
          db_resp_rows = []
          BATCH_SIZE = 5000

          try:
-             parser = ijson.items(process.stdout, "item")
-
+             # Parse stdout with csv.reader
+             reader = csv.reader(process.stdout, delimiter="|", quotechar='"')  # type: ignore
              with sqlite3.connect(db_path) as conn:
                  cursor = conn.cursor()

-                 for packet in parser:
-                     layers = packet.get("_source", {}).get("layers", {})
-                     if not layers:
+                 for row in reader:
+                     # row is a list matching the -e order above:
+                     # [code, req_in, reassembled, frame, payload, epoch, pdu, uri]
+                     if not row:
                          continue

                      try:
-                         frame_num, request_in, time_epoch, full_uri, full_request = FlowAnalyzer.parse_packet_data(layers)
+                         # Parse the row
+                         frame_num, request_in, time_epoch, full_uri, full_request = FlowAnalyzer.parse_packet_data(row)
+
                          if not full_request:
                              continue
+
                          header, file_data = FlowAnalyzer.extract_http_file_data(full_request)

-                         if layers.get("http.response.code"):
+                         # Request or response?
+                         # http.response.code (index 0) is non-empty only for responses
+                         if row[0]:
+                             # Response
                              db_resp_rows.append((frame_num, header, file_data, time_epoch, request_in))
                          else:
+                             # Request
                              db_req_rows.append((frame_num, header, file_data, full_uri, time_epoch))

+                         # Flush batches
                          if len(db_req_rows) >= BATCH_SIZE:
                              cursor.executemany("INSERT OR REPLACE INTO requests VALUES (?,?,?,?,?)", db_req_rows)
                              db_req_rows.clear()
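
To see why the -E options make the output easy to stream, here is a sketch that parses two hand-written lines in the same pipe-separated, double-quoted layout; the hex payloads and URI are invented, not real tshark output:

    import csv, io

    # Two fabricated lines in tshark's "fields" format, per the -E options above:
    # header=n, separator=|, quote=d, occurrence=f. Field order matches the -e list.
    sample = io.StringIO(
        '"200"|"1"|"485454502f312e31"|"2"|""|"1700000000.123"|""|""\n'       # response row
        '""|""|""|"1"|"474554"|"1699999999.456"|""|"http://example.com/"\n'  # request row
    )
    for row in csv.reader(sample, delimiter="|", quotechar='"'):
        kind = "response" if row[0] else "request"
        print(kind, "frame", row[3])  # -> "response frame 2", then "request frame 1"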
@@ -294,14 +319,19 @@ class FlowAnalyzer:
                              cursor.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", db_resp_rows)
                              db_resp_rows.clear()

-                     except Exception:
+                     except Exception as e:
+                         # The occasional row may fail to parse; just skip it
                          pass

+                 # Insert any remaining rows
                  if db_req_rows:
                      cursor.executemany("INSERT OR REPLACE INTO requests VALUES (?,?,?,?,?)", db_req_rows)
                  if db_resp_rows:
                      cursor.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", db_resp_rows)

+                 # --- Optimization: creating the index after the bulk insert is faster ---
+                 cursor.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
+
                  pcap_mtime = os.path.getmtime(pcap_path)
                  pcap_size = os.path.getsize(pcap_path)
                  cursor.execute("INSERT INTO meta_info (filter, pcap_path, pcap_mtime, pcap_size) VALUES (?, ?, ?, ?)", (display_filter, pcap_path, pcap_mtime, pcap_size))
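
The create-the-index-last trick is easy to measure. A rough, self-contained comparison; the table here is a stand-in and the timings are illustrative only:

    import sqlite3, time

    def load(index_first: bool) -> float:
        conn = sqlite3.connect(":memory:")
        cur = conn.cursor()
        cur.execute("CREATE TABLE responses (frame_num INTEGER, request_in INTEGER)")
        if index_first:
            cur.execute("CREATE INDEX idx ON responses(request_in)")
        start = time.perf_counter()
        cur.executemany("INSERT INTO responses VALUES (?, ?)", ((i, i - 1) for i in range(200_000)))
        if not index_first:
            cur.execute("CREATE INDEX idx ON responses(request_in)")
        conn.commit()
        return time.perf_counter() - start

    print("index before inserts:", load(True))
    print("index after inserts: ", load(False))

Each insert into an indexed table must also update the index's B-tree, so building the index once over the finished table does strictly less work.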
@@ -319,18 +349,29 @@ class FlowAnalyzer:
      # --- Static helper methods ---

      @staticmethod
-     def parse_packet_data(packet: dict) -> Tuple[int, int, float, str, str]:
-         frame_num = int(packet["frame.number"][0])
-         request_in = int(packet["http.request_in"][0]) if packet.get("http.request_in") else frame_num
-         full_uri = parse.unquote(packet["http.request.full_uri"][0]) if packet.get("http.request.full_uri") else ""
-         time_epoch = float(packet["frame.time_epoch"][0])
-
-         if packet.get("tcp.reassembled.data"):
-             full_request = packet["tcp.reassembled.data"][0]
-         elif packet.get("tcp.payload"):
-             full_request = packet["tcp.payload"][0]
+     def parse_packet_data(row: list) -> Tuple[int, int, float, str, str]:
+         # row layout (matches the -e field order):
+         # 0: http.response.code
+         # 1: http.request_in
+         # 2: tcp.reassembled.data
+         # 3: frame.number
+         # 4: tcp.payload
+         # 5: frame.time_epoch
+         # 6: exported_pdu.exported_pdu
+         # 7: http.request.full_uri
+
+         frame_num = int(row[3])
+         request_in = int(row[1]) if row[1] else frame_num
+         full_uri = parse.unquote(row[7]) if row[7] else ""
+         time_epoch = float(row[5])
+
+         if row[2]:
+             full_request = row[2]
+         elif row[4]:
+             full_request = row[4]
          else:
-             full_request = packet["exported_pdu.exported_pdu"][0] if packet.get("exported_pdu.exported_pdu") else ""
+             full_request = row[6] if row[6] else ""
+
          return frame_num, request_in, time_epoch, full_uri, full_request

      @staticmethod
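
Assuming the module layout shown in the RECORD below (FlowAnalyzer/FlowAnalyzer.py exporting the FlowAnalyzer class), the row-based parser can be exercised directly; the sample row is invented and follows the 8-field order documented above:

    from FlowAnalyzer.FlowAnalyzer import FlowAnalyzer

    # 8 fields in -e order; empty strings stand for fields tshark left blank
    row = ["", "", "", "42", "474554202f20485454502f312e31", "1700000000.5", "", "http://example.com/?q=a%20b"]
    frame_num, request_in, time_epoch, full_uri, full_request = FlowAnalyzer.parse_packet_data(row)
    print(frame_num)     # 42
    print(request_in)    # 42 -- falls back to frame_num when http.request_in is empty
    print(full_uri)      # http://example.com/?q=a b -- percent-decoded by parse.unquote
    print(full_request)  # the tcp.payload hex, since tcp.reassembled.data is empty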
@@ -345,10 +386,7 @@ class FlowAnalyzer:

      @staticmethod
      def dechunck_http_response(file_data: bytes) -> bytes:
-         """Decode chunked TCP data (fixed version).
-         Note: if the data is not in chunked format, this function must raise an exception,
-         so that the outer logic can fall back to the raw data.
-         """
+         """Decode chunked TCP data"""
          if not file_data:
              return b""

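
The docstring trimmed here previously documented an important contract: for non-chunked input the function must raise, so the caller falls back to the raw bytes. For reference, a toy decoder of the chunked wire format (not the package's implementation, which this hunk does not show) exhibits the same behavior, raising ValueError when the length prefix is not hex:

    def dechunk(data: bytes) -> bytes:
        out, pos = b"", 0
        while True:
            eol = data.index(b"\r\n", pos)  # ValueError on malformed input
            size = int(data[pos:eol], 16)   # ValueError if the prefix is not hex, i.e. not chunked
            if size == 0:
                return out
            out += data[eol + 2 : eol + 2 + size]
            pos = eol + 2 + size + 2        # skip the chunk body and its trailing CRLF

    print(dechunk(b"5\r\nhello\r\n0\r\n\r\n"))  # b'hello'
    # dechunk(b"HTTP is not chunked") raises ValueError, triggering the caller's fallback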
flowanalyzer-0.4.1.dist-info/METADATA → flowanalyzer-0.4.3.dist-info/METADATA

@@ -1,6 +1,6 @@
- Metadata-Version: 2.4
+ Metadata-Version: 2.1
  Name: FlowAnalyzer
- Version: 0.4.1
+ Version: 0.4.3
  Summary: FlowAnalyzer is a traffic analyzer for parsing and processing the JSON data files exported by tshark
  Home-page: https://github.com/Byxs20/FlowAnalyzer
  Author: Byxs20
@@ -15,14 +15,6 @@ Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: license-file
- Dynamic: summary

  # FlowAnalyzer

flowanalyzer-0.4.3.dist-info/RECORD (added)

@@ -0,0 +1,9 @@
+ FlowAnalyzer/FlowAnalyzer.py,sha256=9seSOamepCnejHYRKLWym9Eu0lbxCgn7p3hE2WUZstk,18964
+ FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
+ FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
+ FlowAnalyzer/logging_config.py,sha256=-RntNJhrBiW7ToXIP1WJjZ4Yf9jmZQ1PTX_er3tDxhw,730
+ flowanalyzer-0.4.3.dist-info/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
+ flowanalyzer-0.4.3.dist-info/METADATA,sha256=W6BhXCna1TYeTVd_gY5Q63xjbckhRpomHYErrtS5fBM,5588
+ flowanalyzer-0.4.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+ flowanalyzer-0.4.3.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
+ flowanalyzer-0.4.3.dist-info/RECORD,,
flowanalyzer-0.4.1.dist-info/WHEEL → flowanalyzer-0.4.3.dist-info/WHEEL

@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.10.2)
+ Generator: setuptools (75.3.2)
  Root-Is-Purelib: true
  Tag: py3-none-any

flowanalyzer-0.4.1.dist-info/RECORD (removed)

@@ -1,9 +0,0 @@
- FlowAnalyzer/FlowAnalyzer.py,sha256=ciuWFPQWQgYqjdL_u7ck4BNIsQNx00HLOjr6lSkfzMg,17348
- FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
- FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
- FlowAnalyzer/logging_config.py,sha256=-RntNJhrBiW7ToXIP1WJjZ4Yf9jmZQ1PTX_er3tDxhw,730
- flowanalyzer-0.4.1.dist-info/licenses/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
- flowanalyzer-0.4.1.dist-info/METADATA,sha256=WD01CpYRDVbT8RA5GwTKYZPv8Fa06_-4ZuiTAa5SfeE,5767
- flowanalyzer-0.4.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
- flowanalyzer-0.4.1.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
- flowanalyzer-0.4.1.dist-info/RECORD,,