FlowAnalyzer 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,7 @@ import os
 import sqlite3
 import subprocess
 from dataclasses import dataclass
-from typing import Dict, Iterable, NamedTuple, Optional, Tuple
+from typing import Iterable, NamedTuple, Optional, Tuple
 from urllib import parse

 import ijson
@@ -69,57 +69,54 @@ class FlowAnalyzer:
         if not os.path.exists(self.db_path):
             raise FileNotFoundError(f"Data file or cache database not found: {self.db_path}; call get_json_data first to generate it.")

-    def _load_from_db(self) -> Tuple[Dict[int, Request], Dict[int, Response]]:
-        """Load data from the SQLite database."""
-        requests, responses = {}, {}
-        try:
-            with sqlite3.connect(self.db_path) as conn:
-                cursor = conn.cursor()
+    def generate_http_dict_pairs(self) -> Iterable[HttpPair]:
+        """Yield paired HTTP request/response records (high-performance SQL JOIN version)."""
+        if not os.path.exists(self.db_path):
+            return

-                # Basic sanity check
-                try:
-                    cursor.execute("SELECT count(*) FROM requests")
-                    if cursor.fetchone()[0] == 0:
-                        cursor.execute("SELECT count(*) FROM responses")
-                        if cursor.fetchone()[0] == 0:
-                            return {}, {}
-                except sqlite3.OperationalError:
-                    logger.error("Database is corrupted or tables are missing")
-                    return {}, {}
-
-                logger.debug(f"Loading cached data: {self.db_path}")
-
-                # Load requests
-                cursor.execute("SELECT frame_num, header, file_data, full_uri, time_epoch FROM requests")
-                for row in cursor.fetchall():
-                    requests[row[0]] = Request(row[0], row[1], row[2], row[3], row[4])
-
-                # Load responses
-                cursor.execute("SELECT frame_num, header, file_data, time_epoch, request_in FROM responses")
-                for row in cursor.fetchall():
-                    responses[row[0]] = Response(row[0], row[1], row[2], row[3], row[4])
-
-            return requests, responses
-        except sqlite3.Error as e:
-            logger.error(f"Error reading database: {e}")
-            return {}, {}
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            # Read-only session: iteration must never write to the cache
+            cursor.execute("PRAGMA query_only = 1;")
+
+            # === Step 1: pairing query ===
+            # Let SQLite's LEFT JOIN match requests to responses directly,
+            # instead of loading everything into Python memory.
+            sql_pair = """
+                SELECT
+                    req.frame_num, req.header, req.file_data, req.full_uri, req.time_epoch,        -- 0-4 (Request)
+                    resp.frame_num, resp.header, resp.file_data, resp.time_epoch, resp.request_in  -- 5-9 (Response)
+                FROM requests req
+                LEFT JOIN responses resp ON req.frame_num = resp.request_in
+                ORDER BY req.frame_num ASC
+            """
+
+            cursor.execute(sql_pair)
+
+            # Stream over the result set; memory usage stays minimal
+            for row in cursor:
+                # Build the Request object. The requests table should never
+                # contain NULLs, but guard with `or b""` just in case.
+                req = Request(frame_num=row[0], header=row[1] or b"", file_data=row[2] or b"", full_uri=row[3] or "", time_epoch=row[4])
+
+                resp = None
+                # A non-NULL row[5] (the response frame_num) means a response was matched
+                if row[5] is not None:
+                    resp = Response(frame_num=row[5], header=row[6] or b"", file_data=row[7] or b"", time_epoch=row[8], _request_in=row[9])

-    def generate_http_dict_pairs(self) -> Iterable[HttpPair]:
-        """Yield paired HTTP request/response records."""
-        requests, responses = self._load_from_db()
-        response_map = {r._request_in: r for r in responses.values()}
-        yielded_resps = set()
-
-        for req_id, req in requests.items():
-            resp = response_map.get(req_id)
-            if resp:
-                yielded_resps.add(resp.frame_num)
                 yield HttpPair(request=req, response=resp)
-            else:
-                yield HttpPair(request=req, response=None)

-        for resp in responses.values():
-            if resp.frame_num not in yielded_resps:
+            # === Step 2: orphan-response query ===
+            # Find responses whose request_in points at a Request that does not exist
+            sql_orphan = """
+                SELECT frame_num, header, file_data, time_epoch, request_in
+                FROM responses
+                WHERE request_in NOT IN (SELECT frame_num FROM requests)
+            """
+            cursor.execute(sql_orphan)
+
+            for row in cursor:
+                resp = Response(frame_num=row[0], header=row[1] or b"", file_data=row[2] or b"", time_epoch=row[3], _request_in=row[4])
                 yield HttpPair(request=None, response=resp)

     # =========================================================================
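
Either side of a yielded HttpPair can be None: a request with no matched response, or an orphan response whose request never made it into the capture. A minimal consumption sketch follows; the constructor argument and the prior get_json_data() step are assumptions inferred from the FileNotFoundError message above, not APIs confirmed by this diff.

    from FlowAnalyzer import FlowAnalyzer  # import path assumed from the package name

    analyzer = FlowAnalyzer("traffic.db")  # hypothetical constructor argument
    for pair in analyzer.generate_http_dict_pairs():
        if pair.request is not None:
            print(pair.request.frame_num, pair.request.full_uri)
        if pair.response is None:
            print("  -> request without a matched response")
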
@@ -220,6 +217,9 @@ class FlowAnalyzer:
         cursor.execute("CREATE TABLE requests (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, full_uri TEXT, time_epoch REAL)")
         cursor.execute("CREATE TABLE responses (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, time_epoch REAL, request_in INTEGER)")

+        # === Key optimization: index request_in to greatly speed up the SQL JOIN pairing ===
+        cursor.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
+
         cursor.execute("""
             CREATE TABLE meta_info (
                 id INTEGER PRIMARY KEY,
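
The effect of the new index is easy to confirm with EXPLAIN QUERY PLAN against the schema created above. This standalone sketch (no package code involved) prints the plan for the pairing JOIN before and after the index exists:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    cur = conn.cursor()
    # Schema copied from the CREATE TABLE statements above
    cur.execute("CREATE TABLE requests (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, full_uri TEXT, time_epoch REAL)")
    cur.execute("CREATE TABLE responses (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, time_epoch REAL, request_in INTEGER)")

    plan = """EXPLAIN QUERY PLAN
    SELECT req.frame_num, resp.frame_num FROM requests req
    LEFT JOIN responses resp ON req.frame_num = resp.request_in"""

    for row in cur.execute(plan):  # without the index: a scan of responses,
        print("before:", row)      # or a transient automatic index built per query

    cur.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
    for row in cur.execute(plan):  # with it: SEARCH ... USING INDEX idx_resp_req_in
        print("after: ", row)

The persistent index makes each probe of responses a B-tree lookup instead of a scan (or a rebuild of a throwaway automatic index), which is what the "greatly speed up the SQL JOIN pairing" comment refers to.
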
@@ -345,10 +345,7 @@ class FlowAnalyzer:

     @staticmethod
     def dechunck_http_response(file_data: bytes) -> bytes:
-        """Decode chunked TCP data (fixed version).
-        Note: if the data is not in chunked format, this function must raise
-        an exception so the outer logic can fall back to the raw data.
-        """
+        """Decode chunked TCP data."""
         if not file_data:
             return b""

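The removed docstring text still describes the contract callers rely on: if file_data is not valid chunked encoding, the function raises so the outer logic can fall back to the raw bytes. A from-scratch illustration of that contract (a hypothetical sketch, not the package's actual dechunck_http_response body, which this diff does not show):

    def dechunk_sketch(file_data: bytes) -> bytes:
        """Hypothetical helper: decode HTTP chunked data, raising ValueError otherwise."""
        out, pos = bytearray(), 0
        while True:
            eol = file_data.index(b"\r\n", pos)                # raises ValueError if malformed
            size = int(file_data[pos:eol].split(b";")[0], 16)  # raises ValueError if size is not hex
            if size == 0:
                return bytes(out)                              # terminating zero-length chunk
            body_start = eol + 2
            out += file_data[body_start:body_start + size]
            pos = body_start + size + 2                        # skip chunk body and trailing CRLF

    assert dechunk_sketch(b"4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n") == b"Wikipedia"
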
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: FlowAnalyzer
-Version: 0.4.1
+Version: 0.4.2
 Summary: FlowAnalyzer is a traffic analyzer for parsing and processing JSON data files exported by tshark
 Home-page: https://github.com/Byxs20/FlowAnalyzer
 Author: Byxs20
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: FlowAnalyzer
-Version: 0.4.1
+Version: 0.4.2
 Summary: FlowAnalyzer is a traffic analyzer for parsing and processing JSON data files exported by tshark
 Home-page: https://github.com/Byxs20/FlowAnalyzer
 Author: Byxs20
@@ -7,7 +7,7 @@ with open(os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf-8"

 setup(
     name="FlowAnalyzer",
-    version="0.4.1",
+    version="0.4.2",
     description="FlowAnalyzer is a traffic analyzer for parsing and processing JSON data files exported by tshark",
     author="Byxs20",
     author_email="97766819@qq.com",