FlowAnalyzer 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,47 +1,20 @@
1
- import contextlib
2
- import csv
3
- import gzip
4
1
  import os
5
2
  import sqlite3
6
3
  import subprocess
7
- from dataclasses import dataclass
8
- from typing import Iterable, NamedTuple, Optional, Tuple
9
- from urllib import parse
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from typing import Iterable, Optional
10
6
 
11
7
  from .logging_config import logger
8
+ from .Models import HttpPair, Request, Response
9
+ from .PacketParser import PacketParser
12
10
  from .Path import get_default_tshark_path
13
11
 
14
12
 
15
- @dataclass
16
- class Request:
17
- __slots__ = ("frame_num", "header", "file_data", "full_uri", "time_epoch")
18
- frame_num: int
19
- header: bytes
20
- file_data: bytes
21
- full_uri: str
22
- time_epoch: float
23
-
24
-
25
- @dataclass
26
- class Response:
27
- __slots__ = ("frame_num", "header", "file_data", "time_epoch", "_request_in")
28
- frame_num: int
29
- header: bytes
30
- file_data: bytes
31
- time_epoch: float
32
- _request_in: Optional[int]
33
-
34
-
35
- class HttpPair(NamedTuple):
36
- request: Optional[Request]
37
- response: Optional[Response]
38
-
39
-
40
13
  class FlowAnalyzer:
41
14
  """
42
15
  FlowAnalyzer 流量分析器 (智能缓存版)
43
16
  特点:
44
- 1. Tshark -> Pipe -> ijson -> SQLite (无中间JSON文件)
17
+ 1. Tshark -> Pipe -> ThreadPool -> SQLite
45
18
  2. 智能校验:自动比对 Filter 和文件修改时间,防止缓存错乱
46
19
  3. 存储优化:数据库文件生成在流量包同级目录下
47
20
  """
@@ -80,7 +53,6 @@ class FlowAnalyzer:
80
53
 
81
54
  # === 第一步:配对查询 ===
82
55
  # 利用 SQLite 的 LEFT JOIN 直接匹配请求和响应
83
- # 避免将所有数据加载到 Python 内存中
84
56
  sql_pair = """
85
57
  SELECT
86
58
  req.frame_num, req.header, req.file_data, req.full_uri, req.time_epoch, -- 0-4 (Request)
@@ -94,19 +66,15 @@ class FlowAnalyzer:
94
66
 
95
67
  # 流式遍历结果,内存占用极低
96
68
  for row in cursor:
97
- # 构建 Request 对象
98
- # 注意处理 NULL 情况,虽然 requests 表理论上不为空,但防万一用 or b''
99
69
  req = Request(frame_num=row[0], header=row[1] or b"", file_data=row[2] or b"", full_uri=row[3] or "", time_epoch=row[4])
100
70
 
101
71
  resp = None
102
- # 如果 row[5] (Response frame_num) 不为空,说明匹配到了响应
103
72
  if row[5] is not None:
104
73
  resp = Response(frame_num=row[5], header=row[6] or b"", file_data=row[7] or b"", time_epoch=row[8], _request_in=row[9])
105
74
 
106
75
  yield HttpPair(request=req, response=resp)
107
76
 
108
77
  # === 第二步:孤儿响应查询 ===
109
- # 找出那些有 request_in 但找不到对应 Request 的响应包
110
78
  sql_orphan = """
111
79
  SELECT frame_num, header, file_data, time_epoch, request_in
112
80
  FROM responses
@@ -126,33 +94,21 @@ class FlowAnalyzer:
126
94
  def get_json_data(file_path: str, display_filter: str, tshark_path: Optional[str] = None) -> str:
127
95
  """
128
96
  获取数据路径 (智能校验版)。
129
-
130
- 逻辑:
131
- 1. 根据 PCAP 路径推算 DB 路径 (位于 PCAP 同级目录)。
132
- 2. 检查 DB 是否存在。
133
- 3. 检查 Filter 和文件元数据是否一致。
134
- 4. 若一致返回路径,不一致则重新解析。
135
97
  """
136
98
  if not os.path.exists(file_path):
137
99
  raise FileNotFoundError("流量包路径不存在:%s" % file_path)
138
100
 
139
- # --- 修改处:获取流量包的绝对路径和所在目录 ---
140
101
  abs_file_path = os.path.abspath(file_path)
141
- pcap_dir = os.path.dirname(abs_file_path) # 获取文件所在的文件夹
102
+ pcap_dir = os.path.dirname(abs_file_path)
142
103
  base_name = os.path.splitext(os.path.basename(abs_file_path))[0]
143
-
144
- # 将 db_path 拼接在流量包所在的目录下
145
104
  db_path = os.path.join(pcap_dir, f"{base_name}.db")
146
- # ----------------------------------------
147
105
 
148
- # --- 校验环节 ---
149
106
  if FlowAnalyzer._is_cache_valid(db_path, abs_file_path, display_filter):
150
107
  logger.debug(f"缓存校验通过 (Filter匹配且文件未变),使用缓存: [{db_path}]")
151
108
  return db_path
152
109
  else:
153
110
  logger.debug(f"缓存失效或不存在 (Filter变更或文件更新),开始重新解析...")
154
111
 
155
- # --- 解析环节 ---
156
112
  tshark_path = FlowAnalyzer.get_tshark_path(tshark_path)
157
113
  FlowAnalyzer._stream_tshark_to_db(abs_file_path, display_filter, tshark_path, db_path)
158
114
 
@@ -160,17 +116,10 @@ class FlowAnalyzer:
160
116
 
161
117
  @staticmethod
162
118
  def get_db_data(file_path: str, display_filter: str, tshark_path: Optional[str] = None) -> str:
163
- """
164
- 获取数据库路径 (get_json_data 的语义化别名)。
165
- 新项目建议使用此方法名,get_json_data 保留用于兼容旧习惯。
166
- """
167
119
  return FlowAnalyzer.get_json_data(file_path, display_filter, tshark_path)
168
120
 
169
121
  @staticmethod
170
122
  def _is_cache_valid(db_path: str, pcap_path: str, current_filter: str) -> bool:
171
- """
172
- 检查缓存有效性:对比 Filter 字符串和文件元数据
173
- """
174
123
  if not os.path.exists(db_path) or os.path.getsize(db_path) == 0:
175
124
  return False
176
125
 
@@ -188,7 +137,6 @@ class FlowAnalyzer:
188
137
 
189
138
  cached_filter, cached_mtime, cached_size = row
190
139
 
191
- # 容差 0.1秒
192
140
  if cached_filter == current_filter and cached_size == current_size and abs(cached_mtime - current_mtime) < 0.1:
193
141
  return True
194
142
  else:
@@ -203,16 +151,7 @@ class FlowAnalyzer:
203
151
 
204
152
  @staticmethod
205
153
  def _stream_tshark_to_db(pcap_path: str, display_filter: str, tshark_path: str, db_path: str):
206
- """流式解析并存入DB,同时记录元数据"""
207
- # 增加 CSV 字段大小限制,防止超大包报错
208
- # 将限制设置为系统最大值,注意 32位系统不要超过 2GB (但 Python int通常是动态的,保险起见设大一点)
209
- # Windows下 sys.maxsize 通常足够大
210
- try:
211
- csv.field_size_limit(500 * 1024 * 1024) # 500 MB
212
- except Exception:
213
- # 如果失败,尝试取最大值
214
- csv.field_size_limit(int(2**31 - 1))
215
-
154
+ """流式解析并存入DB (多线程版)"""
216
155
  if os.path.exists(db_path):
217
156
  os.remove(db_path)
218
157
 
@@ -235,7 +174,6 @@ class FlowAnalyzer:
235
174
  """)
236
175
  conn.commit()
237
176
 
238
- # 修改命令为 -T fields 模式
239
177
  command = [
240
178
  tshark_path,
241
179
  "-r",
@@ -244,7 +182,6 @@ class FlowAnalyzer:
244
182
  f"({display_filter})",
245
183
  "-T",
246
184
  "fields",
247
- # 指定输出字段
248
185
  "-e",
249
186
  "http.response.code", # 0
250
187
  "-e",
@@ -261,81 +198,91 @@ class FlowAnalyzer:
261
198
  "exported_pdu.exported_pdu", # 6
262
199
  "-e",
263
200
  "http.request.full_uri", # 7
264
- # 格式控制
201
+ "-e",
202
+ "tcp.segment.count", # 8
265
203
  "-E",
266
- "header=n", # 不输出表头
204
+ "header=n",
267
205
  "-E",
268
- "separator=|", # 使用 | 分割 (比逗号更安全)
206
+ "separator=/t",
269
207
  "-E",
270
- "quote=d", # 双引号包裹
208
+ "quote=n",
271
209
  "-E",
272
- "occurrence=f", # 每个字段只取第一个值 (First)
210
+ "occurrence=f",
273
211
  ]
274
212
 
275
213
  logger.debug(f"执行 Tshark: {command}")
214
+ BATCH_SIZE = 2000
215
+ MAX_PENDING_BATCHES = 20 # 控制内存中待处理的批次数量 (Backpressure)
276
216
 
277
- # 使用 utf-8 编码读取 stdout text mode
278
- process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(pcap_path)), encoding="utf-8", errors="replace")
279
-
280
- db_req_rows = []
281
- db_resp_rows = []
282
- BATCH_SIZE = 5000
217
+ # 使用 ThreadPoolExecutor 并行处理数据
218
+ max_workers = min(32, (os.cpu_count() or 1) + 4)
283
219
 
220
+ process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(pcap_path)))
284
221
  try:
285
- # 使用 csv.reader 解析 stdout 流
286
- reader = csv.reader(process.stdout, delimiter="|", quotechar='"') # type: ignore
287
222
  with sqlite3.connect(db_path) as conn:
288
223
  cursor = conn.cursor()
289
224
 
290
- for row in reader:
291
- # row 是一个列表,对应上面的 -e 顺序
292
- # [code, req_in, reassembled, frame, payload, epoch, pdu, uri]
293
- if not row:
294
- continue
295
-
296
- try:
297
- # 解析数据
298
- frame_num, request_in, time_epoch, full_uri, full_request = FlowAnalyzer.parse_packet_data(row)
225
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
226
+ current_batch = []
227
+ pending_futures = [] # List[Future]
299
228
 
300
- if not full_request:
301
- continue
229
+ def write_results_to_db(results):
230
+ """将一批处理好的结果写入数据库"""
231
+ if not results:
232
+ return
302
233
 
303
- header, file_data = FlowAnalyzer.extract_http_file_data(full_request)
234
+ db_req_rows = []
235
+ db_resp_rows = []
304
236
 
305
- # 判断是请求还是响应
306
- # http.response.code (index 0) 是否为空
307
- if row[0]:
308
- # Response
309
- db_resp_rows.append((frame_num, header, file_data, time_epoch, request_in))
310
- else:
311
- # Request
312
- db_req_rows.append((frame_num, header, file_data, full_uri, time_epoch))
237
+ for item in results:
238
+ if item["type"] == "response":
239
+ db_resp_rows.append((item["frame_num"], item["header"], item["file_data"], item["time_epoch"], item["request_in"]))
240
+ else:
241
+ db_req_rows.append((item["frame_num"], item["header"], item["file_data"], item["full_uri"], item["time_epoch"]))
313
242
 
314
- # 批量插入
315
- if len(db_req_rows) >= BATCH_SIZE:
243
+ if db_req_rows:
316
244
  cursor.executemany("INSERT OR REPLACE INTO requests VALUES (?,?,?,?,?)", db_req_rows)
317
- db_req_rows.clear()
318
- if len(db_resp_rows) >= BATCH_SIZE:
245
+ if db_resp_rows:
319
246
  cursor.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", db_resp_rows)
320
- db_resp_rows.clear()
321
247
 
322
- except Exception as e:
323
- # 偶尔可能会有解析失败的行,跳过即可
324
- pass
248
+ def submit_batch():
249
+ """提交当前批次到线程池"""
250
+ if not current_batch:
251
+ return
325
252
 
326
- # 插入剩余数据
327
- if db_req_rows:
328
- cursor.executemany("INSERT OR REPLACE INTO requests VALUES (?,?,?,?,?)", db_req_rows)
329
- if db_resp_rows:
330
- cursor.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", db_resp_rows)
253
+ # Copy batch data for the thread (list slicing is fast)
254
+ batch_data = current_batch[:]
255
+ future = executor.submit(PacketParser.process_batch, batch_data)
256
+ pending_futures.append(future)
257
+ current_batch.clear()
331
258
 
332
- # --- 优化点:插入完数据后再创建索引,速度更快 ---
333
- cursor.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
259
+ # --- Main Pipeline Loop ---
260
+ if process.stdout:
261
+ for line in process.stdout:
262
+ current_batch.append(line)
263
+
264
+ if len(current_batch) >= BATCH_SIZE:
265
+ submit_batch()
266
+
267
+ # Backpressure: 如果积压的任务太多,主线程暂停读取,先处理掉最早的一个
268
+ # 这样既保证了 Pipeline 流动,又防止内存爆掉
269
+ if len(pending_futures) >= MAX_PENDING_BATCHES:
270
+ oldest_future = pending_futures.pop(0)
271
+ write_results_to_db(oldest_future.result())
272
+
273
+ # --- Drain Pipeline ---
274
+ # 提交剩余数据
275
+ submit_batch()
276
+
277
+ # 等待所有剩余任务完成
278
+ for future in pending_futures:
279
+ write_results_to_db(future.result())
334
280
 
281
+ # 创建索引和元数据
282
+ cursor.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
335
283
  pcap_mtime = os.path.getmtime(pcap_path)
336
284
  pcap_size = os.path.getsize(pcap_path)
337
285
  cursor.execute("INSERT INTO meta_info (filter, pcap_path, pcap_mtime, pcap_size) VALUES (?, ?, ?, ?)", (display_filter, pcap_path, pcap_mtime, pcap_size))
338
-
339
286
  conn.commit()
340
287
 
341
288
  except Exception as e:
@@ -346,131 +293,6 @@ class FlowAnalyzer:
346
293
  if process.poll() is None:
347
294
  process.terminate()
348
295
 
349
- # --- 辅助静态方法 ---
350
-
351
- @staticmethod
352
- def parse_packet_data(row: list) -> Tuple[int, int, float, str, str]:
353
- # row definition:
354
- # 0: http.response.code
355
- # 1: http.request_in
356
- # 2: tcp.reassembled.data
357
- # 3: frame.number
358
- # 4: tcp.payload
359
- # 5: frame.time_epoch
360
- # 6: exported_pdu.exported_pdu
361
- # 7: http.request.full_uri
362
-
363
- frame_num = int(row[3])
364
- request_in = int(row[1]) if row[1] else frame_num
365
- full_uri = parse.unquote(row[7]) if row[7] else ""
366
- time_epoch = float(row[5])
367
-
368
- if row[2]:
369
- full_request = row[2]
370
- elif row[4]:
371
- full_request = row[4]
372
- else:
373
- full_request = row[6] if row[6] else ""
374
-
375
- return frame_num, request_in, time_epoch, full_uri, full_request
376
-
377
- @staticmethod
378
- def split_http_headers(file_data: bytes) -> Tuple[bytes, bytes]:
379
- headerEnd = file_data.find(b"\r\n\r\n")
380
- if headerEnd != -1:
381
- return file_data[: headerEnd + 4], file_data[headerEnd + 4 :]
382
- elif file_data.find(b"\n\n") != -1:
383
- headerEnd = file_data.index(b"\n\n") + 2
384
- return file_data[:headerEnd], file_data[headerEnd:]
385
- return b"", file_data
386
-
387
- @staticmethod
388
- def dechunck_http_response(file_data: bytes) -> bytes:
389
- """解码分块TCP数据"""
390
- if not file_data:
391
- return b""
392
-
393
- chunks = []
394
- cursor = 0
395
- total_len = len(file_data)
396
-
397
- while cursor < total_len:
398
- # 1. 寻找当前 Chunk Size 行的结束符 (\n)
399
- newline_idx = file_data.find(b"\n", cursor)
400
- if newline_idx == -1:
401
- # 找不到换行符,说明格式不对,抛出异常让外层处理
402
- raise ValueError("Not chunked data")
403
-
404
- # 2. 提取并解析十六进制大小
405
- size_line = file_data[cursor:newline_idx].strip()
406
-
407
- # 处理可能的空行 (例如上一个 Chunk 后的 CRLF)
408
- if not size_line:
409
- cursor = newline_idx + 1
410
- continue
411
-
412
- # 这里不要捕获 ValueError,如果解析失败,直接抛出
413
- # 说明这根本不是 chunk size,而是普通数据
414
- chunk_size = int(size_line, 16)
415
-
416
- # Chunk Size 为 0 表示传输结束
417
- if chunk_size == 0:
418
- break
419
-
420
- # 3. 定位数据区域
421
- data_start = newline_idx + 1
422
- data_end = data_start + chunk_size
423
-
424
- if data_end > total_len:
425
- # 数据被截断,尽力读取
426
- chunks.append(file_data[data_start:])
427
- break
428
-
429
- # 4. 提取数据
430
- chunks.append(file_data[data_start:data_end])
431
-
432
- # 5. 移动游标
433
- cursor = data_end
434
- # 跳过尾随的 \r 和 \n
435
- while cursor < total_len and file_data[cursor] in (13, 10):
436
- cursor += 1
437
-
438
- return b"".join(chunks)
439
-
440
- @staticmethod
441
- def extract_http_file_data(full_request: str) -> Tuple[bytes, bytes]:
442
- """提取HTTP请求或响应中的文件数据 (修复版)"""
443
- # 1. 基础校验
444
- if not full_request:
445
- return b"", b""
446
-
447
- try:
448
- # 转为二进制
449
- raw_bytes = bytes.fromhex(full_request)
450
-
451
- # 分割 Header 和 Body
452
- header, file_data = FlowAnalyzer.split_http_headers(raw_bytes)
453
-
454
- # 处理 Chunked 编码
455
- with contextlib.suppress(Exception):
456
- file_data = FlowAnalyzer.dechunck_http_response(file_data)
457
-
458
- # 处理 Gzip 压缩
459
- with contextlib.suppress(Exception):
460
- if file_data.startswith(b"\x1f\x8b"):
461
- file_data = gzip.decompress(file_data)
462
-
463
- return header, file_data
464
-
465
- except ValueError as e:
466
- # 专门捕获 Hex 转换错误,并打印出来,方便你调试
467
- # 如果你在控制台看到这个错误,说明 Tshark 输出的数据格式非常奇怪
468
- logger.error(f"Hex转换失败: {str(e)[:100]}... 原数据片段: {full_request[:50]}")
469
- return b"", b""
470
- except Exception as e:
471
- logger.error(f"解析HTTP数据未知错误: {e}")
472
- return b"", b""
473
-
474
296
  @staticmethod
475
297
  def get_tshark_path(tshark_path: Optional[str]) -> str:
476
298
  default_tshark_path = get_default_tshark_path()
FlowAnalyzer/Models.py ADDED
@@ -0,0 +1,27 @@
1
+ from dataclasses import dataclass
2
+ from typing import NamedTuple, Optional
3
+
4
+
5
@dataclass
class Request:
    """One parsed HTTP request as stored in (and read back from) the ``requests`` table."""

    # __slots__ removes the per-instance __dict__; this works together with
    # @dataclass here because none of the fields declares a default value.
    __slots__ = ("frame_num", "header", "file_data", "full_uri", "time_epoch")
    # Frame number of the packet that carried the request.
    frame_num: int
    # Raw header bytes, including the blank-line terminator (empty if none was found).
    header: bytes
    # Body bytes that follow the header.
    file_data: bytes
    # Value of tshark's http.request.full_uri after URL-unquoting ("" when absent).
    full_uri: str
    # Value of tshark's frame.time_epoch for this frame.
    time_epoch: float
13
+
14
+
15
@dataclass
class Response:
    """One parsed HTTP response as stored in (and read back from) the ``responses`` table."""

    # __slots__ removes the per-instance __dict__; this works together with
    # @dataclass here because none of the fields declares a default value.
    __slots__ = ("frame_num", "header", "file_data", "time_epoch", "_request_in")
    # Frame number of the packet that carried the response.
    frame_num: int
    # Raw header bytes, including the blank-line terminator (empty if none was found).
    header: bytes
    # Body bytes that follow the header.
    file_data: bytes
    # Value of tshark's frame.time_epoch for this frame.
    time_epoch: float
    # Frame number of the request this response answers (tshark's http.request_in).
    # NOTE(review): the parser substitutes the response's own frame number when
    # tshark reports no http.request_in - confirm callers expect that.
    _request_in: Optional[int]
23
+
24
+
25
class HttpPair(NamedTuple):
    """A request together with its matching response; either side may be missing."""

    # None for an "orphan" response whose request was not found.
    request: Optional[Request]
    # None when no response was matched to the request.
    response: Optional[Response]
@@ -0,0 +1,183 @@
1
+ import binascii
2
+ import contextlib
3
+ import gzip
4
+ from typing import List, Optional, Tuple
5
+ from urllib import parse
6
+
7
+ from .logging_config import logger
8
+
9
+
10
+ class PacketParser:
11
+ @staticmethod
12
+ def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes]:
13
+ """
14
+ 解析 Tshark 输出的一行数据
15
+ row definition (all bytes):
16
+ 0: http.response.code
17
+ 1: http.request_in
18
+ 2: tcp.reassembled.data
19
+ 3: frame.number
20
+ 4: tcp.payload
21
+ 5: frame.time_epoch
22
+ 6: exported_pdu.exported_pdu
23
+ 7: http.request.full_uri
24
+ 8: tcp.segment.count
25
+ """
26
+ frame_num = int(row[3])
27
+ request_in = int(row[1]) if row[1] else frame_num
28
+ # Decode only URI to string
29
+ full_uri = parse.unquote(row[7].decode("utf-8", errors="replace")) if row[7] else ""
30
+ time_epoch = float(row[5])
31
+
32
+ # Logic for Raw Packet (Header Source)
33
+ # Previous index 9 is now 8 since we removed http.file_data
34
+ is_reassembled = len(row) > 8 and row[8]
35
+
36
+ if is_reassembled and row[2]:
37
+ full_request = row[2]
38
+ elif row[4]:
39
+ full_request = row[4]
40
+ else:
41
+ # Fallback (e.g. Exported PDU)
42
+ full_request = row[2] if row[2] else (row[6] if row[6] else b"")
43
+
44
+ return frame_num, request_in, time_epoch, full_uri, full_request
45
+
46
+ @staticmethod
47
+ def split_http_headers(file_data: bytes) -> Tuple[bytes, bytes]:
48
+ headerEnd = file_data.find(b"\r\n\r\n")
49
+ if headerEnd != -1:
50
+ return file_data[: headerEnd + 4], file_data[headerEnd + 4 :]
51
+ elif file_data.find(b"\n\n") != -1:
52
+ headerEnd = file_data.index(b"\n\n") + 2
53
+ return file_data[:headerEnd], file_data[headerEnd:]
54
+ return b"", file_data
55
+
56
+ @staticmethod
57
+ def dechunk_http_response(file_data: bytes) -> bytes:
58
+ """解码分块TCP数据"""
59
+ if not file_data:
60
+ return b""
61
+
62
+ chunks = []
63
+ cursor = 0
64
+ total_len = len(file_data)
65
+
66
+ while cursor < total_len:
67
+ newline_idx = file_data.find(b"\n", cursor)
68
+ if newline_idx == -1:
69
+ # If no newline found, maybe it's just remaining data (though strictly should end with 0 chunk)
70
+ # But for robustness we might perform a "best effort" or just stop.
71
+ # raising ValueError("Not chunked data") might be too aggressive if we are just "trying" to dechunk
72
+ # Let's assume non-chunked if strict format not found
73
+ raise ValueError("Not chunked data")
74
+
75
+ size_line = file_data[cursor:newline_idx].strip()
76
+ if not size_line:
77
+ cursor = newline_idx + 1
78
+ continue
79
+
80
+ try:
81
+ chunk_size = int(size_line, 16)
82
+ except ValueError:
83
+ raise ValueError("Invalid chunk size")
84
+
85
+ if chunk_size == 0:
86
+ break
87
+
88
+ data_start = newline_idx + 1
89
+ data_end = data_start + chunk_size
90
+
91
+ # Robustness check
92
+ if data_start > total_len:
93
+ break
94
+
95
+ if data_end > total_len:
96
+ chunks.append(file_data[data_start:])
97
+ break
98
+
99
+ chunks.append(file_data[data_start:data_end])
100
+
101
+ cursor = data_end
102
+ # Skip CRLF after chunk data
103
+ while cursor < total_len and file_data[cursor] in (13, 10):
104
+ cursor += 1
105
+
106
+ return b"".join(chunks)
107
+
108
+ @staticmethod
109
+ def extract_http_file_data(full_request: bytes) -> Tuple[bytes, bytes]:
110
+ """
111
+ 提取HTTP请求或响应中的文件数据 (混合模式 - 二进制优化版)
112
+ """
113
+ header = b""
114
+ file_data = b""
115
+
116
+ if not full_request:
117
+ return b"", b""
118
+ try:
119
+ raw_bytes = binascii.unhexlify(full_request)
120
+ header, body_part = PacketParser.split_http_headers(raw_bytes)
121
+
122
+ with contextlib.suppress(Exception):
123
+ body_part = PacketParser.dechunk_http_response(body_part)
124
+
125
+ with contextlib.suppress(Exception):
126
+ if body_part.startswith(b"\x1f\x8b"):
127
+ body_part = gzip.decompress(body_part)
128
+
129
+ file_data = body_part
130
+ return header, file_data
131
+
132
+ except binascii.Error:
133
+ logger.error("Hex转换失败")
134
+ return b"", b""
135
+ except Exception as e:
136
+ logger.error(f"解析HTTP数据未知错误: {e}")
137
+ return b"", b""
138
+
139
+ @staticmethod
140
+ def process_row(line: bytes) -> Optional[dict]:
141
+ """
142
+ 处理单行数据,返回结构化结果供主线程写入
143
+ """
144
+ line = line.rstrip(b"\r\n")
145
+ if not line:
146
+ return None
147
+
148
+ row = line.split(b"\t")
149
+ try:
150
+ frame_num, request_in, time_epoch, full_uri, full_request = PacketParser.parse_packet_data(row)
151
+
152
+ if not full_request:
153
+ return None
154
+
155
+ header, file_data = PacketParser.extract_http_file_data(full_request)
156
+
157
+ # row[0] is http.response.code (bytes)
158
+ is_response = bool(row[0])
159
+
160
+ return {
161
+ "type": "response" if is_response else "request",
162
+ "frame_num": frame_num,
163
+ "header": header,
164
+ "file_data": file_data,
165
+ "time_epoch": time_epoch,
166
+ "request_in": request_in, # Only useful for Response
167
+ "full_uri": full_uri, # Only useful for Request
168
+ }
169
+
170
+ except Exception:
171
+ return None
172
+
173
+ @staticmethod
174
+ def process_batch(lines: List[bytes]) -> List[dict]:
175
+ """
176
+ 批量处理行数据,减少函数调用开销
177
+ """
178
+ results = []
179
+ for line in lines:
180
+ res = PacketParser.process_row(line)
181
+ if res:
182
+ results.append(res)
183
+ return results
@@ -0,0 +1,128 @@
1
+ import os
2
+ import time
3
+ from collections import defaultdict
4
+ from typing import List, Tuple
5
+
6
+ import dpkt
7
+
8
+
9
class PcapSplitter:
    """
    Split a PCAP/PCAPNG file into several smaller PCAP files, keeping every
    TCP flow inside a single output file and balancing the files by volume.

    The capture is read twice: once to measure each flow, once to copy the
    packets. Only per-flow counters are kept in memory, and each output file
    keeps the packets in capture order.
    """

    # First four bytes of a pcapng file (Section Header Block type).
    _PCAPNG_MAGIC = b"\x0a\x0d\x0d\x0a"

    def __init__(self, pcap_file: str, output_dir: str):
        """Remember the capture to split and the directory for the output files."""
        self.pcap_file = pcap_file
        self.output_dir = output_dir

    def get_stream_key(self, tcp, ip) -> Tuple:
        """Return a direction-independent ``(addr, addr, port, port)`` key for the flow.

        Both directions of a connection map to the same key: the smaller of
        the forward and the reversed tuple is used.
        """
        forward = (ip.src, ip.dst, tcp.sport, tcp.dport)
        backward = (ip.dst, ip.src, tcp.dport, tcp.sport)
        return min(forward, backward)

    @classmethod
    def _open_reader(cls, handle):
        """Pick the dpkt reader from the file's magic number, not its extension."""
        magic = handle.read(4)
        handle.seek(0)
        if magic == cls._PCAPNG_MAGIC:
            return dpkt.pcapng.Reader(handle)
        return dpkt.pcap.Reader(handle)

    def _iter_tcp_packets(self):
        """Yield ``(stream_key, ts, buf)`` for every TCP-over-IP Ethernet frame."""
        with open(self.pcap_file, "rb") as handle:
            for ts, buf in self._open_reader(handle):
                try:
                    ip = dpkt.ethernet.Ethernet(buf).data
                    if not isinstance(ip, (dpkt.ip.IP, dpkt.ip6.IP6)):
                        continue
                    tcp = ip.data
                    if not isinstance(tcp, dpkt.tcp.TCP):
                        continue
                    key = self.get_stream_key(tcp, ip)
                except Exception:
                    # Best effort: a frame that cannot be dissected is skipped.
                    continue
                yield key, ts, buf

    def split(self, threshold_mb: int = 10, default_chunks: int = 3) -> List[str]:
        """
        Split the pcap file into balanced chunks based on stream volume (bytes).
        Uses a Greedy Partition Algorithm (Longest Processing Time first).

        Args:
            threshold_mb: File size threshold in MB. If smaller, do not split.
            default_chunks: Number of chunks to split into if threshold is
                exceeded. Values below 1 are treated as 1.

        Returns:
            List of generated file paths, or a list holding only the original
            file when it is below the threshold or contains no TCP stream.

        Raises:
            FileNotFoundError: the input capture does not exist.
        """
        if not os.path.exists(self.pcap_file):
            raise FileNotFoundError(f"PCAP file not found: {self.pcap_file}")

        file_size_mb = os.path.getsize(self.pcap_file) / (1024 * 1024)
        if file_size_mb < threshold_mb:
            print(f"File size {file_size_mb:.2f}MB < {threshold_mb}MB. Skipping split.")
            return [self.pcap_file]

        os.makedirs(self.output_dir, exist_ok=True)
        start_time = time.time()

        # 1. First pass: measure every stream (stream_key -> total_bytes).
        print(f"Reading {self.pcap_file}...")
        stream_sizes = defaultdict(int)
        for key, _ts, buf in self._iter_tcp_packets():
            stream_sizes[key] += len(buf)

        total_streams = len(stream_sizes)
        print(f"Found {total_streams} TCP streams.")

        if total_streams == 0:
            print("No TCP streams found to split.")
            # Nothing was split, so hand back the original capture instead of
            # an empty list that would make the caller analyse nothing.
            return [self.pcap_file]

        # 2. Assign streams to buckets (greedy LPT: largest stream first,
        #    always into the currently lightest bucket).
        num_chunks = max(1, min(default_chunks, total_streams))
        bucket_bytes = [0] * num_chunks
        bucket_streams = [0] * num_chunks
        bucket_of = {}

        for key, size in sorted(stream_sizes.items(), key=lambda item: item[1], reverse=True):
            lightest = min(range(num_chunks), key=bucket_bytes.__getitem__)
            bucket_of[key] = lightest
            bucket_bytes[lightest] += size
            bucket_streams[lightest] += 1

        print(f"Splitting into {num_chunks} files with volume balancing...")
        generated_files = [os.path.join(self.output_dir, f"batch_{i}.pcap") for i in range(num_chunks)]

        # 3. Second pass: stream each packet straight into its bucket's file.
        handles = []
        try:
            writers = []
            for path in generated_files:
                handle = open(path, "wb")
                handles.append(handle)
                writers.append(dpkt.pcap.Writer(handle))

            for key, ts, buf in self._iter_tcp_packets():
                index = bucket_of.get(key)
                if index is None:
                    # The file changed between the two passes; ignore the newcomer.
                    continue
                writers[index].writepkt(buf, ts)
        finally:
            for handle in handles:
                handle.close()

        for i, out_file_path in enumerate(generated_files):
            size = bucket_bytes[i]
            print(f" - Created {os.path.basename(out_file_path)}: {bucket_streams[i]} streams ({size/1024/1024:.2f} MB)")

        print(f"Split completed in {time.time() - start_time:.2f}s")
        return generated_files
@@ -15,8 +15,9 @@ def configure_logger(logger_name, level=logging.DEBUG) -> logging.Logger:
15
15
  console_handler.setFormatter(formatter)
16
16
  return logger
17
17
 
18
+
18
19
  logger = configure_logger("FlowAnalyzer", logging.INFO)
19
20
 
20
- if __name__ == '__main__':
21
+ if __name__ == "__main__":
21
22
  logger = configure_logger("FlowAnalyzer")
22
23
  logger.info("This is a test!")
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: FlowAnalyzer
3
- Version: 0.4.3
3
+ Version: 0.4.5
4
4
  Summary: FlowAnalyzer是一个流量分析器,用于解析和处理tshark导出的JSON数据文件
5
5
  Home-page: https://github.com/Byxs20/FlowAnalyzer
6
6
  Author: Byxs20
@@ -15,6 +15,14 @@ Classifier: Programming Language :: Python :: 3.8
15
15
  Classifier: Programming Language :: Python :: 3.9
16
16
  Description-Content-Type: text/markdown
17
17
  License-File: LICENSE
18
+ Dynamic: author
19
+ Dynamic: author-email
20
+ Dynamic: classifier
21
+ Dynamic: description
22
+ Dynamic: description-content-type
23
+ Dynamic: home-page
24
+ Dynamic: license-file
25
+ Dynamic: summary
18
26
 
19
27
  # FlowAnalyzer
20
28
 
@@ -28,9 +36,11 @@ License-File: LICENSE
28
36
 
29
37
  为了解决传统解析方式慢、内存占用高的问题,FlowAnalyzer 进行了核心架构升级:**流式解析 + SQLite 智能缓存**。
30
38
 
31
- ### 1. ⚡️ 高性能流式解析
32
- - **极低内存占用**:不再将整个 JSON 读入内存。通过 `subprocess` 管道对接 Tshark 输出,结合 `ijson` 进行增量解析。
33
- - **无中间文件**:解析过程中不生成体积巨大的临时 JSON 文件,直接入库。
39
+ ### 1. ⚡️ 高性能流式解析 (多线程流水线)
40
+ - **多线程并行**:采用 `ThreadPoolExecutor` 构建流水线,主线程负责读取 Tshark 输出,子线程并行解析数据包,充分利用多核 CPU。
41
+ - **批量处理**:引入 Batch 机制(默认 2000 包/批),大幅减少数据库事务开销和 Python 函数调用损耗。
42
+ - **内存背压控制 (Backpressure)**:智能监控待处理队列长度,防止在处理高速流量时内存溢出。
43
+ - **极低内存占用**:不再将整个 JSON 读入内存。通过 `subprocess` 管道流式处理,解析过程中不生成体积巨大的临时文件。
34
44
 
35
45
  ### 2. 💾 智能缓存机制
36
46
  - **自动缓存**:首次分析 `test.pcap` 时,会自动生成同级目录下的 `test.db`。
@@ -51,7 +61,7 @@ License-File: LICENSE
51
61
 
52
62
  | 特性 | 旧版架构 | **新版架构 (FlowAnalyzer)** |
53
63
  | :----------- | :---------------------------- | :---------------------------------- |
54
- | **解析流程** | 生成巨大 JSON -> 全量读入内存 | Tshark流 -> 管道 -> ijson -> SQLite |
64
+ | **解析流程** | 生成巨大 JSON -> 全量读入内存 | Tshark流 -> 多线程Batch解析 -> SQLite |
55
65
  | **内存占用** | 极高 (易 OOM) | **极低 (内存稳定)** |
56
66
  | **二次加载** | 需重新解析 | **直接读取 DB (0秒)** |
57
67
  | **磁盘占用** | 巨大的临时 JSON 文件 | 轻量级 SQLite 文件 |
@@ -63,11 +73,11 @@ License-File: LICENSE
63
73
  请确保您的环境中已安装 Python 3 和 Tshark (Wireshark)。
64
74
 
65
75
  ```bash
66
- # 安装 FlowAnalyzer 及其依赖 ijson
67
- pip3 install FlowAnalyzer ijson
76
+ # 安装 FlowAnalyzer
77
+ pip3 install FlowAnalyzer
68
78
 
69
79
  # 或者使用国内源加速
70
- pip3 install FlowAnalyzer ijson -i https://pypi.org/simple
80
+ pip3 install FlowAnalyzer -i https://pypi.org/simple
71
81
  ```
72
82
 
73
83
  ---
@@ -0,0 +1,12 @@
1
+ FlowAnalyzer/FlowAnalyzer.py,sha256=9SshWk5wf0XATI7W4eBiIpzEqeGFyQJs3on5ox-zrNQ,12666
2
+ FlowAnalyzer/Models.py,sha256=2x7nPJIAyLTC1oiGFlW4mELDPgthk2IsmuyearT-MSQ,622
3
+ FlowAnalyzer/PacketParser.py,sha256=vdXUMFteSlIbOJ4y4_ikUIL3HwBCFBBgjevNL0jLozE,6174
4
+ FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
5
+ FlowAnalyzer/PcapSplitter.py,sha256=0E_vmLYYsE_gD34XTwG1XPx5kBg8ZchJspQEnkBoIdY,4855
6
+ FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
7
+ FlowAnalyzer/logging_config.py,sha256=fnBlvoimteQ38IBlQBV9fdLQvfAlRgGhcvLpUC3YunA,732
8
+ flowanalyzer-0.4.5.dist-info/licenses/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
9
+ flowanalyzer-0.4.5.dist-info/METADATA,sha256=oyoNqX8eZkkiNTrkz8qrZ6T7ofrW0lnFSxXoV2Q1wIU,6099
10
+ flowanalyzer-0.4.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
11
+ flowanalyzer-0.4.5.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
12
+ flowanalyzer-0.4.5.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.3.2)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,9 +0,0 @@
1
- FlowAnalyzer/FlowAnalyzer.py,sha256=9seSOamepCnejHYRKLWym9Eu0lbxCgn7p3hE2WUZstk,18964
2
- FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
3
- FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
4
- FlowAnalyzer/logging_config.py,sha256=-RntNJhrBiW7ToXIP1WJjZ4Yf9jmZQ1PTX_er3tDxhw,730
5
- flowanalyzer-0.4.3.dist-info/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
6
- flowanalyzer-0.4.3.dist-info/METADATA,sha256=W6BhXCna1TYeTVd_gY5Q63xjbckhRpomHYErrtS5fBM,5588
7
- flowanalyzer-0.4.3.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
8
- flowanalyzer-0.4.3.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
9
- flowanalyzer-0.4.3.dist-info/RECORD,,