FlowAnalyzer 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,21 @@
1
1
  import contextlib
2
2
  import gzip
3
- import hashlib
4
- import json
5
3
  import os
4
+ import sqlite3
6
5
  import subprocess
6
+ from dataclasses import dataclass
7
7
  from typing import Dict, Iterable, NamedTuple, Optional, Tuple
8
8
  from urllib import parse
9
9
 
10
+ import ijson
11
+
10
12
  from .logging_config import logger
11
13
  from .Path import get_default_tshark_path
12
14
 
13
15
 
14
- class Request(NamedTuple):
16
+ @dataclass
17
+ class Request:
18
+ __slots__ = ("frame_num", "header", "file_data", "full_uri", "time_epoch")
15
19
  frame_num: int
16
20
  header: bytes
17
21
  file_data: bytes
@@ -19,12 +23,14 @@ class Request(NamedTuple):
19
23
  time_epoch: float
20
24
 
21
25
 
22
- class Response(NamedTuple):
26
@dataclass
class Response:
    """A single HTTP response row loaded from the cache database.

    ``_request_in`` is the frame number of the request this response is
    matched against when pairs are generated; it may be ``None``.
    """

    # Declared by hand so instances carry no per-object ``__dict__``.
    __slots__ = (
        "frame_num",
        "header",
        "file_data",
        "time_epoch",
        "_request_in",
    )

    frame_num: int
    header: bytes
    file_data: bytes
    time_epoch: float
    _request_in: Optional[int]
28
34
 
29
35
 
30
36
  class HttpPair(NamedTuple):
@@ -33,293 +39,405 @@ class HttpPair(NamedTuple):
33
39
 
34
40
 
35
41
  class FlowAnalyzer:
36
- """FlowAnalyzer是一个流量分析器,用于解析和处理tshark导出的JSON数据文件"""
37
-
38
- def __init__(self, json_path: str):
39
- """初始化FlowAnalyzer对象
40
-
41
- Parameters
42
- ----------
43
- json_path : str
44
- tshark导出的JSON文件路径
42
+ """
43
+ FlowAnalyzer 流量分析器 (智能缓存版)
44
+ 特点:
45
+ 1. Tshark -> Pipe -> ijson -> SQLite (无中间JSON文件)
46
+ 2. 智能校验:自动比对 Filter 和文件修改时间,防止缓存错乱
47
+ 3. 存储优化:数据库文件生成在流量包同级目录下
48
+ """
49
+
50
def __init__(self, db_path: str):
    """Bind the analyzer to a cache database and verify that it exists.

    :param db_path: path of the database file (as returned by
        ``get_json_data``). For compatibility with older callers a path
        ending in ``.json`` is accepted: when ``<db_path>.db`` exists it is
        used instead, otherwise the path is kept unchanged.
    :raises FileNotFoundError: raised by ``check_db_file`` when the resolved
        path does not exist.
    """
    companion_db = db_path + ".db"
    use_companion = db_path.endswith(".json") and os.path.exists(companion_db)
    self.db_path = companion_db if use_companion else db_path

    self.check_db_file()
95
66
 
96
- Returns
97
- -------
98
- tuple
99
- 包含请求字典和响应列表的元组
100
- """
101
- with open(self.json_path, "r", encoding="utf-8") as f:
102
- data = json.load(f)
67
def check_db_file(self):
    """Ensure the configured database file exists.

    :raises FileNotFoundError: when ``self.db_path`` does not exist.
    """
    if os.path.exists(self.db_path):
        return
    raise FileNotFoundError(f"未找到数据文件或缓存数据库: {self.db_path},请先调用 get_json_data 生成。")
103
71
 
72
def _load_from_db(self) -> Tuple[Dict[int, Request], Dict[int, Response]]:
    """Load every cached request and response from the SQLite database.

    :return: ``(requests, responses)``, both keyed by frame number. Two
        empty dicts are returned when both tables are empty, when the
        tables are missing, or when any SQLite error occurs (the error is
        logged, not raised).
    """
    requests: Dict[int, Request] = {}
    responses: Dict[int, Response] = {}
    try:
        # ``with sqlite3.connect(...)`` alone only scopes a transaction and
        # leaves the connection open; closing() releases the file handle.
        with contextlib.closing(sqlite3.connect(self.db_path)) as conn:
            cursor = conn.cursor()

            # Cheap sanity probe: distinguishes "tables missing" from other
            # failures and short-circuits a completely empty cache.
            try:
                cursor.execute("SELECT count(*) FROM requests")
                if cursor.fetchone()[0] == 0:
                    cursor.execute("SELECT count(*) FROM responses")
                    if cursor.fetchone()[0] == 0:
                        return {}, {}
            except sqlite3.OperationalError:
                logger.error("数据库损坏或表不存在")
                return {}, {}

            logger.debug(f"正在加载缓存数据: {self.db_path}")

            # Iterate the cursor directly instead of fetchall() so the raw
            # rows are not held in memory alongside the built objects.
            cursor.execute("SELECT frame_num, header, file_data, full_uri, time_epoch FROM requests")
            for row in cursor:
                requests[row[0]] = Request(*row)

            cursor.execute("SELECT frame_num, header, file_data, time_epoch, request_in FROM responses")
            for row in cursor:
                responses[row[0]] = Response(*row)

            return requests, responses
    except sqlite3.Error as e:
        logger.error(f"读取数据库出错: {e}")
        return {}, {}
106
+
107
def generate_http_dict_pairs(self) -> Iterable[HttpPair]:
    """Yield request/response pairs built from the cached data.

    Every request is yielded once, paired with the response whose
    ``_request_in`` equals the request's frame number (or ``None`` when no
    such response exists). Afterwards every response that was not paired is
    yielded with ``request=None``.
    """
    requests, responses = self._load_from_db()

    # Later responses overwrite earlier ones that name the same request.
    by_request_frame = {}
    for response in responses.values():
        by_request_frame[response._request_in] = response

    paired_frames = set()

    for frame_num, request in requests.items():
        matched = by_request_frame.get(frame_num)
        if not matched:
            yield HttpPair(request=request, response=None)
            continue
        paired_frames.add(matched.frame_num)
        yield HttpPair(request=request, response=matched)

    for response in responses.values():
        if response.frame_num in paired_frames:
            continue
        yield HttpPair(request=None, response=response)
148
124
 
125
+ # =========================================================================
126
+ # 静态方法区域:包含校验逻辑和流式处理
127
+ # =========================================================================
128
+
149
129
  @staticmethod
150
- def get_hash(file_path: str, display_filter: str) -> str:
151
- with open(file_path, "rb") as f:
152
- return hashlib.md5(f.read() + display_filter.encode()).hexdigest()
130
+ def get_json_data(file_path: str, display_filter: str, tshark_path: Optional[str] = None) -> str:
131
+ """
132
+ 获取数据路径 (智能校验版)
133
+
134
+ 逻辑:
135
+ 1. 根据 PCAP 路径推算 DB 路径 (位于 PCAP 同级目录)。
136
+ 2. 检查 DB 是否存在。
137
+ 3. 检查 Filter 和文件元数据是否一致。
138
+ 4. 若一致返回路径,不一致则重新解析。
139
+ """
140
+ if not os.path.exists(file_path):
141
+ raise FileNotFoundError("流量包路径不存在:%s" % file_path)
142
+
143
+ # --- 修改处:获取流量包的绝对路径和所在目录 ---
144
+ abs_file_path = os.path.abspath(file_path)
145
+ pcap_dir = os.path.dirname(abs_file_path) # 获取文件所在的文件夹
146
+ base_name = os.path.splitext(os.path.basename(abs_file_path))[0]
147
+
148
+ # 将 db_path 拼接在流量包所在的目录下
149
+ db_path = os.path.join(pcap_dir, f"{base_name}.db")
150
+ # ----------------------------------------
151
+
152
+ # --- 校验环节 ---
153
+ if FlowAnalyzer._is_cache_valid(db_path, abs_file_path, display_filter):
154
+ logger.debug(f"缓存校验通过 (Filter匹配且文件未变),使用缓存: [{db_path}]")
155
+ return db_path
156
+ else:
157
+ logger.debug(f"缓存失效或不存在 (Filter变更或文件更新),开始重新解析...")
158
+
159
+ # --- 解析环节 ---
160
+ tshark_path = FlowAnalyzer.get_tshark_path(tshark_path)
161
+ FlowAnalyzer._stream_tshark_to_db(abs_file_path, display_filter, tshark_path, db_path)
162
+
163
+ return db_path
164
+
165
@staticmethod
def get_db_data(file_path: str, display_filter: str, tshark_path: Optional[str] = None) -> str:
    """Semantic alias of ``get_json_data``: return the cache database path.

    New code is encouraged to use this name; ``get_json_data`` is kept for
    callers written against the older API.
    """
    return FlowAnalyzer.get_json_data(
        file_path=file_path,
        display_filter=display_filter,
        tshark_path=tshark_path,
    )
172
+
173
@staticmethod
def _is_cache_valid(db_path: str, pcap_path: str, current_filter: str) -> bool:
    """Tell whether an existing cache database still matches the capture.

    The cache is valid when its ``meta_info`` row records the same display
    filter, the same capture size, and a capture mtime within 0.1 s of the
    current one.

    :param db_path: path of the candidate cache database.
    :param pcap_path: path of the capture file.
    :param current_filter: display filter requested for this run.
    :return: ``True`` only when the cache can be reused; any error while
        checking yields ``False`` so the caller re-parses.
    """
    if not os.path.exists(db_path) or os.path.getsize(db_path) == 0:
        return False

    try:
        current_mtime = os.path.getmtime(pcap_path)
        current_size = os.path.getsize(pcap_path)

        # closing() matters here: the sqlite3 connection context manager does
        # not close the connection, and a lingering handle keeps the file
        # locked on Windows, breaking the os.remove() done before a rebuild.
        with contextlib.closing(sqlite3.connect(db_path)) as conn:
            row = conn.execute("SELECT filter, pcap_mtime, pcap_size FROM meta_info LIMIT 1").fetchone()

        if not row:
            return False

        cached_filter, cached_mtime, cached_size = row

        # 0.1 s tolerance on the modification time.
        if cached_filter == current_filter and cached_size == current_size and abs(cached_mtime - current_mtime) < 0.1:
            return True

        # Report every compared value; the mismatch is not always the filter.
        logger.debug(
            f"校验失败: 缓存Filter={cached_filter} vs 当前={current_filter}, "
            f"size={cached_size} vs {current_size}, mtime={cached_mtime} vs {current_mtime}"
        )
        return False

    except sqlite3.OperationalError:
        # e.g. the meta_info table is missing.
        return False
    except Exception as e:
        logger.warning(f"缓存校验出错: {e},将重新解析")
        return False
153
207
 
154
208
@staticmethod
def _stream_tshark_to_db(pcap_path: str, display_filter: str, tshark_path: str, db_path: str):
    """Run tshark on a capture and stream its JSON output into a fresh SQLite cache.

    Any existing file at ``db_path`` is removed first. Packets are parsed
    incrementally with ``ijson`` and written in batches. The ``meta_info``
    row (filter, capture path, mtime, size) is written only after all
    packets were stored, so an interrupted run leaves a cache without it.

    :param pcap_path: capture file handed to ``tshark -r``.
    :param display_filter: display filter handed to ``tshark -Y``.
    :param tshark_path: tshark executable to run.
    :param db_path: SQLite file to create.
    """
    import tempfile  # function-scope import: only this method needs it

    if os.path.exists(db_path):
        os.remove(db_path)

    exported_fields = (
        "http.response.code",
        "http.request_in",
        "tcp.reassembled.data",
        "frame.number",
        "tcp.payload",
        "frame.time_epoch",
        "exported_pdu.exported_pdu",
        "http.request.full_uri",
    )
    command = [tshark_path, "-r", pcap_path, "-Y", f"({display_filter})", "-T", "json"]
    for field in exported_fields:
        command += ["-e", field]

    logger.debug(f"执行 Tshark: {command}")

    insert_request = "INSERT OR REPLACE INTO requests VALUES (?,?,?,?,?)"
    insert_response = "INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)"
    BATCH_SIZE = 5000
    db_req_rows = []
    db_resp_rows = []

    # One connection for schema AND inserts: the PRAGMAs below are
    # per-connection, so setting them on a separate, earlier connection
    # would not affect the bulk insert. closing() also releases the handle.
    # stderr goes to a temp file rather than an undrained pipe, which could
    # fill up and block tshark while we are reading stdout.
    with contextlib.closing(sqlite3.connect(db_path)) as conn, tempfile.TemporaryFile() as tshark_stderr:
        cursor = conn.cursor()
        cursor.execute("PRAGMA synchronous = OFF")
        cursor.execute("PRAGMA journal_mode = MEMORY")

        cursor.execute("CREATE TABLE requests (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, full_uri TEXT, time_epoch REAL)")
        cursor.execute("CREATE TABLE responses (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, time_epoch REAL, request_in INTEGER)")
        cursor.execute("""
            CREATE TABLE meta_info (
                id INTEGER PRIMARY KEY,
                filter TEXT,
                pcap_path TEXT,
                pcap_mtime REAL,
                pcap_size INTEGER
            )
        """)
        conn.commit()

        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=tshark_stderr,
            cwd=os.path.dirname(os.path.abspath(pcap_path)),
        )

        try:
            for packet in ijson.items(process.stdout, "item"):
                layers = packet.get("_source", {}).get("layers", {})
                if not layers:
                    continue

                # A single malformed packet must not abort the whole run, but
                # it should leave a trace instead of vanishing silently.
                try:
                    frame_num, request_in, time_epoch, full_uri, full_request = FlowAnalyzer.parse_packet_data(layers)
                    if not full_request:
                        continue
                    header, file_data = FlowAnalyzer.extract_http_file_data(full_request)
                except (KeyError, IndexError, TypeError, ValueError) as e:
                    logger.debug(f"跳过无法解析的数据包: {e!r}")
                    continue

                if layers.get("http.response.code"):
                    db_resp_rows.append((frame_num, header, file_data, time_epoch, request_in))
                else:
                    db_req_rows.append((frame_num, header, file_data, full_uri, time_epoch))

                # Database errors are deliberately NOT swallowed per packet:
                # they reach the handler below and the cache stays invalid.
                if len(db_req_rows) >= BATCH_SIZE:
                    cursor.executemany(insert_request, db_req_rows)
                    db_req_rows.clear()
                if len(db_resp_rows) >= BATCH_SIZE:
                    cursor.executemany(insert_response, db_resp_rows)
                    db_resp_rows.clear()

            if db_req_rows:
                cursor.executemany(insert_request, db_req_rows)
            if db_resp_rows:
                cursor.executemany(insert_response, db_resp_rows)

            pcap_mtime = os.path.getmtime(pcap_path)
            pcap_size = os.path.getsize(pcap_path)
            cursor.execute(
                "INSERT INTO meta_info (filter, pcap_path, pcap_mtime, pcap_size) VALUES (?, ?, ?, ?)",
                (display_filter, pcap_path, pcap_mtime, pcap_size),
            )
            conn.commit()

        except Exception as e:
            # Drop the partial batch; without a meta_info row the cache is
            # rejected by the validity check on the next run.
            conn.rollback()
            logger.error(f"解析错误: {e}")
        finally:
            if process.poll() is None:
                process.terminate()
            if process.stdout is not None:
                process.stdout.close()
            process.wait()  # reap the child so it does not linger as a zombie

            tshark_stderr.seek(0)
            stderr_text = tshark_stderr.read().decode("utf-8", errors="replace").strip()
            if process.returncode != 0 and stderr_text:
                logger.warning(f"Tshark 退出码 {process.returncode}: {stderr_text}")
264
318
 
265
- def split_http_headers(self, file_data: bytes) -> Tuple[bytes, bytes]:
319
+ # --- 辅助静态方法 ---
320
+
321
+ @staticmethod
322
+ def parse_packet_data(packet: dict) -> Tuple[int, int, float, str, str]:
323
+ frame_num = int(packet["frame.number"][0])
324
+ request_in = int(packet["http.request_in"][0]) if packet.get("http.request_in") else frame_num
325
+ full_uri = parse.unquote(packet["http.request.full_uri"][0]) if packet.get("http.request.full_uri") else ""
326
+ time_epoch = float(packet["frame.time_epoch"][0])
327
+
328
+ if packet.get("tcp.reassembled.data"):
329
+ full_request = packet["tcp.reassembled.data"][0]
330
+ elif packet.get("tcp.payload"):
331
+ full_request = packet["tcp.payload"][0]
332
+ else:
333
+ full_request = packet["exported_pdu.exported_pdu"][0] if packet.get("exported_pdu.exported_pdu") else ""
334
+ return frame_num, request_in, time_epoch, full_uri, full_request
335
+
336
+ @staticmethod
337
+ def split_http_headers(file_data: bytes) -> Tuple[bytes, bytes]:
266
338
  headerEnd = file_data.find(b"\r\n\r\n")
267
339
  if headerEnd != -1:
268
- headerEnd += 4
269
- return file_data[:headerEnd], file_data[headerEnd:]
340
+ return file_data[: headerEnd + 4], file_data[headerEnd + 4 :]
270
341
  elif file_data.find(b"\n\n") != -1:
271
342
  headerEnd = file_data.index(b"\n\n") + 2
272
343
  return file_data[:headerEnd], file_data[headerEnd:]
273
- else:
274
- print("[Warning] 没有找到headers和response的划分位置!")
275
- return b"", file_data
276
-
277
- def dechunck_http_response(self, file_data: bytes) -> bytes:
278
- """解码分块TCP数据
279
-
280
- Parameters
281
- ----------
282
- file_data : bytes
283
- 已经切割掉headers的TCP数据
344
+ return b"", file_data
284
345
 
285
- Returns
286
- -------
287
- bytes
288
- 解码分块后的TCP数据
346
+ @staticmethod
347
+ def dechunck_http_response(file_data: bytes) -> bytes:
348
+ """解码分块TCP数据 (修复版)
349
+ 注意:如果数据不是 Chunked 格式,此函数必须抛出异常,
350
+ 以便外层逻辑回退到使用原始数据。
289
351
  """
352
+ if not file_data:
353
+ return b""
354
+
290
355
  chunks = []
291
- chunkSizeEnd = file_data.find(b"\n") + 1
292
- lineEndings = b"\r\n" if bytes([file_data[chunkSizeEnd - 2]]) == b"\r" else b"\n"
293
- lineEndingsLength = len(lineEndings)
294
- while True:
295
- chunkSize = int(file_data[:chunkSizeEnd], 16)
296
- if not chunkSize:
356
+ cursor = 0
357
+ total_len = len(file_data)
358
+
359
+ while cursor < total_len:
360
+ # 1. 寻找当前 Chunk Size 行的结束符 (\n)
361
+ newline_idx = file_data.find(b"\n", cursor)
362
+ if newline_idx == -1:
363
+ # 找不到换行符,说明格式不对,抛出异常让外层处理
364
+ raise ValueError("Not chunked data")
365
+
366
+ # 2. 提取并解析十六进制大小
367
+ size_line = file_data[cursor:newline_idx].strip()
368
+
369
+ # 处理可能的空行 (例如上一个 Chunk 后的 CRLF)
370
+ if not size_line:
371
+ cursor = newline_idx + 1
372
+ continue
373
+
374
+ # 这里不要捕获 ValueError,如果解析失败,直接抛出
375
+ # 说明这根本不是 chunk size,而是普通数据
376
+ chunk_size = int(size_line, 16)
377
+
378
+ # Chunk Size 为 0 表示传输结束
379
+ if chunk_size == 0:
297
380
  break
298
381
 
299
- chunks.append(file_data[chunkSizeEnd : chunkSize + chunkSizeEnd])
300
- file_data = file_data[chunkSizeEnd + chunkSize + lineEndingsLength :]
301
- chunkSizeEnd = file_data.find(lineEndings) + lineEndingsLength
302
- return b"".join(chunks)
382
+ # 3. 定位数据区域
383
+ data_start = newline_idx + 1
384
+ data_end = data_start + chunk_size
303
385
 
304
- def extract_http_file_data(self, full_request: str) -> Tuple[bytes, bytes]:
305
- """提取HTTP请求或响应中的文件数据
386
+ if data_end > total_len:
387
+ # 数据被截断,尽力读取
388
+ chunks.append(file_data[data_start:])
389
+ break
306
390
 
307
- Parameters
308
- ----------
309
- full_request : bytes
310
- HTTP请求或响应的原始字节流
391
+ # 4. 提取数据
392
+ chunks.append(file_data[data_start:data_end])
311
393
 
312
- Returns
313
- -------
314
- tuple
315
- 包含header和file_data的元组
316
- """
317
- header, file_data = self.split_http_headers(bytes.fromhex(full_request))
394
+ # 5. 移动游标
395
+ cursor = data_end
396
+ # 跳过尾随的 \r 和 \n
397
+ while cursor < total_len and file_data[cursor] in (13, 10):
398
+ cursor += 1
399
+
400
+ return b"".join(chunks)
318
401
 
319
- with contextlib.suppress(Exception):
320
- file_data = self.dechunck_http_response(file_data)
402
@staticmethod
def extract_http_file_data(full_request: str) -> Tuple[bytes, bytes]:
    """Turn a hex-encoded HTTP message into ``(header, body)`` bytes.

    The body is de-chunked when it parses as chunked data and gunzipped when
    it starts with the gzip magic bytes; if either step fails the body from
    the previous step is kept.

    :param full_request: hexadecimal text of the raw message.
    :return: ``(header, body)``; ``(b"", b"")`` for empty input or when the
        conversion fails (the failure is logged).
    """
    if not full_request:
        return b"", b""

    try:
        header, body = FlowAnalyzer.split_http_headers(bytes.fromhex(full_request))

        # Not every body is chunked; a failure here simply keeps the raw body.
        with contextlib.suppress(Exception):
            body = FlowAnalyzer.dechunck_http_response(body)

        if body.startswith(b"\x1f\x8b"):
            with contextlib.suppress(Exception):
                body = gzip.decompress(body)

    except ValueError as e:
        # Raised by bytes.fromhex for text that is not valid hexadecimal.
        logger.error(f"Hex转换失败: {str(e)[:100]}... 原数据片段: {full_request[:50]}")
        return b"", b""
    except Exception as e:
        logger.error(f"解析HTTP数据未知错误: {e}")
        return b"", b""

    return header, body
321
435
 
322
- with contextlib.suppress(Exception):
323
- if file_data.startswith(b"\x1F\x8B"):
324
- file_data = gzip.decompress(file_data)
325
- return header, file_data
436
@staticmethod
def get_tshark_path(tshark_path: Optional[str]) -> str:
    """Pick the tshark executable to run.

    An explicitly passed path wins when it exists; otherwise the packaged
    default path is used.

    :param tshark_path: optional explicit path to tshark.
    :return: an existing tshark path.
    :raises SystemExit: with code ``-1`` when neither path exists.
    """
    default_tshark_path = get_default_tshark_path()

    if tshark_path and os.path.exists(tshark_path):
        return tshark_path

    if tshark_path:
        # Do not fall back silently: the caller asked for a specific binary.
        logger.warning(f"传入的 tshark_path 不存在: {tshark_path},将尝试默认路径")

    if not default_tshark_path or not os.path.exists(default_tshark_path):
        logger.critical("未找到 Tshark,请检查路径配置")
        # Raise SystemExit directly: the exit() builtin is injected by the
        # site module and is missing under `python -S` or in frozen builds.
        raise SystemExit(-1)
    return default_tshark_path
@@ -0,0 +1,153 @@
1
+ Metadata-Version: 2.4
2
+ Name: FlowAnalyzer
3
+ Version: 0.4.1
4
+ Summary: FlowAnalyzer是一个流量分析器,用于解析和处理tshark导出的JSON数据文件
5
+ Home-page: https://github.com/Byxs20/FlowAnalyzer
6
+ Author: Byxs20
7
+ Author-email: 97766819@qq.com
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.6
13
+ Classifier: Programming Language :: Python :: 3.7
14
+ Classifier: Programming Language :: Python :: 3.8
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Dynamic: author
19
+ Dynamic: author-email
20
+ Dynamic: classifier
21
+ Dynamic: description
22
+ Dynamic: description-content-type
23
+ Dynamic: home-page
24
+ Dynamic: license-file
25
+ Dynamic: summary
26
+
27
+ # FlowAnalyzer
28
+
29
+ [![PyPI version](https://img.shields.io/pypi/v/FlowAnalyzer.svg)](https://pypi.org/project/FlowAnalyzer/) [![License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) ![Python 3.7+](https://img.shields.io/badge/python-3.7+-blue.svg)
30
+
31
+ **FlowAnalyzer** 是一个高效的 Python 流量分析库,基于 `Tshark` 进行底层解析。它专为处理大流量包(Large PCAP)设计,采用流式解析与 SQLite 缓存架构,彻底解决内存溢出问题,实现秒级二次加载。
32
+
33
+ ---
34
+
35
+ ## 🚀 核心特性:智能缓存与流式架构
36
+
37
+ 为了解决传统解析方式慢、内存占用高的问题,FlowAnalyzer 进行了核心架构升级:**流式解析 + SQLite 智能缓存**。
38
+
39
+ ### 1. ⚡️ 高性能流式解析
40
+ - **极低内存占用**:不再将整个 JSON 读入内存。通过 `subprocess` 管道对接 Tshark 输出,结合 `ijson` 进行增量解析。
41
+ - **无中间文件**:解析过程中不生成体积巨大的临时 JSON 文件,直接入库。
42
+
43
+ ### 2. 💾 智能缓存机制
44
+ - **自动缓存**:首次分析 `test.pcap` 时,会自动生成同级目录下的 `test.db`。
45
+ - **秒级加载**:二次分析时,直接读取 SQLite 数据库,跳过漫长的 Tshark 解析过程(速度提升 100 倍+)。
46
+
47
+ ### 3. 🛡️ 智能校验 (Smart Validation)
48
+ 为了防止“修改了过滤规则但误读旧缓存”的问题,内置了严格的元数据校验机制。每次运行时自动比对指纹:
49
+
50
+ | 校验项 | 说明 |
51
+ | :---------------------- | :----------------------------------------------------------- |
52
+ | **过滤规则 (Filter)** | 检查本次传入的 Tshark 过滤器(如 `http contains flag`)是否与缓存一致。 |
53
+ | **文件指纹 (Metadata)** | 检查原始 PCAP 文件的 **修改时间 (MTime)** 和 **文件大小 (Size)**。 |
54
+
55
+ - ✅ **命中缓存**:规则一致且文件未变 → **0秒等待,直接加载**。
56
+ - 🔄 **缓存失效**:规则变更或文件更新 → **自动重新解析并更新数据库**。
57
+
58
+ ### 4. 性能对比
59
+
60
+ | 特性 | 旧版架构 | **新版架构 (FlowAnalyzer)** |
61
+ | :----------- | :---------------------------- | :---------------------------------- |
62
+ | **解析流程** | 生成巨大 JSON -> 全量读入内存 | Tshark流 -> 管道 -> ijson -> SQLite |
63
+ | **内存占用** | 极高 (易 OOM) | **极低 (内存稳定)** |
64
+ | **二次加载** | 需重新解析 | **直接读取 DB (0秒)** |
65
+ | **磁盘占用** | 巨大的临时 JSON 文件 | 轻量级 SQLite 文件 |
66
+
67
+ ---
68
+
69
+ ## 📦 安装
70
+
71
+ 请确保您的环境中已安装 Python 3 和 Tshark (Wireshark)。
72
+
73
+ ```bash
74
+ # 安装 FlowAnalyzer 及其依赖 ijson
75
+ pip3 install FlowAnalyzer ijson
76
+
77
+ # 或者显式指定 PyPI 官方源(如需加速,可将 -i 后的地址替换为国内镜像源)
78
+ pip3 install FlowAnalyzer ijson -i https://pypi.org/simple
79
+ ```
80
+
81
+ ---
82
+
83
+ ## 🛠️ 快速上手
84
+
85
+ ### 1. 基础使用
86
+
87
+ ```python
88
+ from FlowAnalyzer import FlowAnalyzer
89
+
90
+ # 流量包路径
91
+ pcap_path = r"tests/demo.pcap"
92
+ # 过滤规则
93
+ display_filter = "http"
94
+
95
+ # 1. 获取数据库数据 (自动处理解析、缓存和校验)
96
+ # 返回的是生成的 .db 文件路径
97
+ db_path = FlowAnalyzer.get_db_data(pcap_path, display_filter)
98
+ # 兼容老的函数名 get_json_data
99
+ # db_path = FlowAnalyzer.get_json_data(pcap_path, display_filter)
100
+
101
+ # 2. 初始化分析器
102
+ analyzer = FlowAnalyzer(db_path)
103
+
104
+ # 3. 遍历 HTTP 流
105
+ print("[+] 开始分析 HTTP 流...")
106
+ for pair in analyzer.generate_http_dict_pairs():
107
+ if pair.request:
108
+ print(f"Frame: {pair.request.frame_num} | URI: {pair.request.full_uri}")
109
+ # 获取请求体数据
110
+ # print(pair.request.file_data)
111
+ ```
112
+
113
+ ### 2. 配置 Tshark 路径
114
+
115
+ 如果您的 `tshark` 不在系统环境变量中,程序可能会报错。您有两种方式进行配置:
116
+
117
+ **方法一:代码中指定 (推荐)**
118
+
119
+ 在调用 `get_db_data` 时直接传入路径:
120
+
121
+ ```python
122
+ tshark_ex = r"D:\Program Files\Wireshark\tshark.exe"
123
+
124
+ FlowAnalyzer.get_db_data(pcap_path, display_filter, tshark_path=tshark_ex)
125
+ ```
126
+
127
+ **方法二:修改默认配置**
128
+
129
+ 如果安装目录固定,可以修改库文件中的默认路径:
130
+ 找到 `python安装目录\Lib\site-packages\FlowAnalyzer\Path.py`,修改 `tshark_path` 变量。
131
+
132
+ ---
133
+
134
+ ## 📝 测试
135
+
136
+ ```bash
137
+ $ git clone https://github.com/Byxs20/FlowAnalyzer.git
138
+ $ cd ./FlowAnalyzer/
139
+ $ python tests/demo.py
140
+ ```
141
+
142
+ **运行预期结果:**
143
+
144
+ ```text
145
+ [+] 正在处理第1个HTTP流!
146
+ 序号: 2请求包, 请求头: b'POST /upload/php_eval_xor_base64.php HTTP/1.1 ...
147
+ ```
148
+
149
+ ---
150
+
151
+ ## 📄 License
152
+
153
+ This project is licensed under the [MIT License](LICENSE).
@@ -0,0 +1,9 @@
1
+ FlowAnalyzer/FlowAnalyzer.py,sha256=ciuWFPQWQgYqjdL_u7ck4BNIsQNx00HLOjr6lSkfzMg,17348
2
+ FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
3
+ FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
4
+ FlowAnalyzer/logging_config.py,sha256=-RntNJhrBiW7ToXIP1WJjZ4Yf9jmZQ1PTX_er3tDxhw,730
5
+ flowanalyzer-0.4.1.dist-info/licenses/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
6
+ flowanalyzer-0.4.1.dist-info/METADATA,sha256=WD01CpYRDVbT8RA5GwTKYZPv8Fa06_-4ZuiTAa5SfeE,5767
7
+ flowanalyzer-0.4.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
8
+ flowanalyzer-0.4.1.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
9
+ flowanalyzer-0.4.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (72.1.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,71 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: FlowAnalyzer
3
- Version: 0.3.9
4
- Summary: FlowAnalyzer是一个流量分析器,用于解析和处理tshark导出的JSON数据文件
5
- Home-page: https://github.com/Byxs20/FlowAnalyzer
6
- Author: Byxs20
7
- Author-email: 97766819@qq.com
8
- Classifier: Development Status :: 3 - Alpha
9
- Classifier: Intended Audience :: Developers
10
- Classifier: License :: OSI Approved :: MIT License
11
- Classifier: Programming Language :: Python :: 3
12
- Classifier: Programming Language :: Python :: 3.6
13
- Classifier: Programming Language :: Python :: 3.7
14
- Classifier: Programming Language :: Python :: 3.8
15
- Classifier: Programming Language :: Python :: 3.9
16
- Description-Content-Type: text/markdown
17
- License-File: LICENSE
18
-
19
- # FlowAnalyzer
20
-
21
- # 安装
22
-
23
- 使用 `pip` 安装:
24
-
25
- ```
26
- pip3 install FlowAnalyzer
27
- ```
28
-
29
- ```
30
- pip3 install FlowAnalyzer -i https://pypi.org/simple
31
- ```
32
-
33
- # 快速上手
34
-
35
- ## 配置
36
-
37
- 如果您安装 `WireShark` 没有修改安装目录,默认 `tshark` 路径会如下:
38
-
39
- ```python
40
- # windows
41
- tshark_path = r"C:\Program Files\Wireshark\tshark.exe"
42
- ```
43
-
44
- `Linux`, `MacOS` 默认路径不清楚,需要看下面的**纠正路径**,**确定路径没有问题,那也无需任何配置即可使用!**
45
-
46
- ## 纠正路径
47
-
48
- 修改 `python安装目录\Lib\site-packages\FlowAnalyzer\Path.py` 中的变量 `tshark_path` 改为**tshark正确路径**
49
-
50
- ## 测试
51
-
52
- ```
53
- $ git clone https://github.com/Byxs20/FlowAnalyzer.git
54
- $ cd ./FlowAnalyzer/
55
- $ python tests\demo.py
56
- ```
57
-
58
- 运行结果:
59
-
60
- ```
61
- [+] 正在处理第1个HTTP流!
62
- 序号: 2请求包, 请求头: b'POST /upload/php_eval_xor_base64.php HTTP/1.1\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0\r\n
63
- ...
64
- ```
65
-
66
- # Contributing
67
- Feel free to submit issues or pull requests if you have any suggestions, improvements, or bug reports.
68
-
69
- # License
70
-
71
- This project is licensed under the [MIT License.](LICENSE)
@@ -1,9 +0,0 @@
1
- FlowAnalyzer/FlowAnalyzer.py,sha256=ErHea4wQEeGmCgAmWr4xmEuKSSYfXE0kFe7It0xD6Is,12203
2
- FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
3
- FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
4
- FlowAnalyzer/logging_config.py,sha256=-RntNJhrBiW7ToXIP1WJjZ4Yf9jmZQ1PTX_er3tDxhw,730
5
- FlowAnalyzer-0.3.9.dist-info/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
6
- FlowAnalyzer-0.3.9.dist-info/METADATA,sha256=OcwMs0sqeUmUv1Y-9NWDaGFswMupCLf-FuJYr68DQX8,1956
7
- FlowAnalyzer-0.3.9.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
8
- FlowAnalyzer-0.3.9.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
9
- FlowAnalyzer-0.3.9.dist-info/RECORD,,