FlowAnalyzer 0.4.2.tar.gz → 0.4.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
  import contextlib
+ import csv
  import gzip
  import os
  import sqlite3
@@ -7,8 +8,6 @@ from dataclasses import dataclass
  from typing import Iterable, NamedTuple, Optional, Tuple
  from urllib import parse

- import ijson
-
  from .logging_config import logger
  from .Path import get_default_tshark_path

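The removed ijson import was how 0.4.2 consumed tshark's "-T json" output: streaming packets out of one huge JSON array instead of loading it whole. For context, a minimal sketch of that old pattern (the file name is a placeholder):

import ijson  # third-party streaming JSON parser; the dependency 0.4.3 drops

# 0.4.2-era pattern: iterate the elements of tshark's "-T json" top-level
# array one at a time, so the full document never sits in memory.
with open("capture.json", "rb") as f:  # placeholder path
    for packet in ijson.items(f, "item"):  # "item" = each array element
        layers = packet.get("_source", {}).get("layers", {})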
@@ -205,6 +204,14 @@ class FlowAnalyzer:
      @staticmethod
      def _stream_tshark_to_db(pcap_path: str, display_filter: str, tshark_path: str, db_path: str):
          """Stream-parse tshark output into the DB while recording metadata"""
+         # Raise the CSV field size limit so oversized packets do not raise an error.
+         # Keep it below 2 GB for 32-bit builds (Python ints are unbounded, so a large fixed value is the safe choice).
+         # On Windows the limit must fit in a 32-bit C long, so sys.maxsize is avoided.
+         try:
+             csv.field_size_limit(500 * 1024 * 1024)  # 500 MB
+         except Exception:
+             # If that fails, fall back to the largest 32-bit signed value
+             csv.field_size_limit(int(2**31 - 1))

          if os.path.exists(db_path):
              os.remove(db_path)
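The try/except above works around a platform quirk: csv.field_size_limit() only accepts values that fit in a C long, which is 32 bits on Windows, so passing sys.maxsize there raises OverflowError. A common standalone version of the same fallback (a sketch, not this package's code) halves the value until the platform accepts it:

import csv
import sys

def set_max_csv_field_size() -> int:
    """Raise csv's field size limit as high as the platform allows."""
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)  # rejects values that overflow a C long
            return limit
        except OverflowError:
            limit //= 2  # halve and retry until the value fits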
@@ -217,9 +224,6 @@ class FlowAnalyzer:
          cursor.execute("CREATE TABLE requests (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, full_uri TEXT, time_epoch REAL)")
          cursor.execute("CREATE TABLE responses (frame_num INTEGER PRIMARY KEY, header BLOB, file_data BLOB, time_epoch REAL, request_in INTEGER)")

-         # === Core optimization: add an index to dramatically speed up SQL JOIN pairing ===
-         cursor.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
-
          cursor.execute("""
              CREATE TABLE meta_info (
                  id INTEGER PRIMARY KEY,
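The index removed here is re-created later in this diff, after the bulk insert. It exists to serve request/response pairing. The pairing query itself is not part of this diff, but against the schema above it would presumably be a JOIN on responses.request_in, along these lines (illustrative SQL, not the package's actual query):

# Hypothetical pairing query: idx_resp_req_in lets SQLite look up each
# response's matching request without scanning the whole responses table.
PAIRING_SQL = """
SELECT req.frame_num, req.full_uri, resp.frame_num, resp.file_data
FROM requests AS req
JOIN responses AS resp ON resp.request_in = req.frame_num
ORDER BY req.time_epoch
"""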
@@ -231,6 +235,7 @@ class FlowAnalyzer:
          """)
          conn.commit()

+         # Switch the command to -T fields mode
          command = [
              tshark_path,
              "-r",
@@ -238,55 +243,75 @@ class FlowAnalyzer:
              "-Y",
              f"({display_filter})",
              "-T",
-             "json",
+             "fields",
+             # Fields to extract
              "-e",
-             "http.response.code",
+             "http.response.code",  # 0
              "-e",
-             "http.request_in",
+             "http.request_in",  # 1
              "-e",
-             "tcp.reassembled.data",
+             "tcp.reassembled.data",  # 2
              "-e",
-             "frame.number",
+             "frame.number",  # 3
              "-e",
-             "tcp.payload",
+             "tcp.payload",  # 4
              "-e",
-             "frame.time_epoch",
+             "frame.time_epoch",  # 5
              "-e",
-             "exported_pdu.exported_pdu",
+             "exported_pdu.exported_pdu",  # 6
              "-e",
-             "http.request.full_uri",
+             "http.request.full_uri",  # 7
+             # Output formatting
+             "-E",
+             "header=n",  # no header row
+             "-E",
+             "separator=|",  # split on | (safer than a comma)
+             "-E",
+             "quote=d",  # wrap fields in double quotes
+             "-E",
+             "occurrence=f",  # take only the first value of each field
          ]

          logger.debug(f"Running tshark: {command}")

-         process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(pcap_path)))
+         # Read stdout in text mode, decoded as UTF-8
+         process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(pcap_path)), encoding="utf-8", errors="replace")

          db_req_rows = []
          db_resp_rows = []
          BATCH_SIZE = 5000

          try:
-             parser = ijson.items(process.stdout, "item")
-
+             # Parse stdout with csv.reader
+             reader = csv.reader(process.stdout, delimiter="|", quotechar='"')  # type: ignore
              with sqlite3.connect(db_path) as conn:
                  cursor = conn.cursor()

-                 for packet in parser:
-                     layers = packet.get("_source", {}).get("layers", {})
-                     if not layers:
+                 for row in reader:
+                     # row is a list in the -e order above:
+                     # [code, req_in, reassembled, frame, payload, epoch, pdu, uri]
+                     if not row:
                          continue

                      try:
-                         frame_num, request_in, time_epoch, full_uri, full_request = FlowAnalyzer.parse_packet_data(layers)
+                         # Parse the row
+                         frame_num, request_in, time_epoch, full_uri, full_request = FlowAnalyzer.parse_packet_data(row)
+
                          if not full_request:
                              continue
+
                          header, file_data = FlowAnalyzer.extract_http_file_data(full_request)

-                         if layers.get("http.response.code"):
+                         # Request or response?
+                         # http.response.code (index 0) is non-empty for responses
+                         if row[0]:
+                             # Response
                              db_resp_rows.append((frame_num, header, file_data, time_epoch, request_in))
                          else:
+                             # Request
                              db_req_rows.append((frame_num, header, file_data, full_uri, time_epoch))

+                         # Batch insert
                          if len(db_req_rows) >= BATCH_SIZE:
                              cursor.executemany("INSERT OR REPLACE INTO requests VALUES (?,?,?,?,?)", db_req_rows)
                              db_req_rows.clear()
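Condensed to its essentials, the new pipeline runs tshark in -T fields mode and feeds its stdout straight into csv.reader, so memory use stays flat regardless of capture size. A self-contained sketch of that technique (capture path and filter are placeholders; tshark is assumed to be on PATH):

import csv
import subprocess

cmd = [
    "tshark", "-r", "capture.pcapng", "-Y", "http",  # placeholder capture and filter
    "-T", "fields",
    "-e", "frame.number",
    "-e", "http.request.full_uri",
    "-E", "header=n", "-E", "separator=|", "-E", "quote=d", "-E", "occurrence=f",
]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, encoding="utf-8", errors="replace")
assert proc.stdout is not None
for row in csv.reader(proc.stdout, delimiter="|", quotechar='"'):
    if not row:
        continue
    frame_number, full_uri = row[0], row[1]  # one field per -e flag, in order
proc.wait()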
@@ -294,14 +319,19 @@ class FlowAnalyzer:
                              cursor.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", db_resp_rows)
                              db_resp_rows.clear()

-                     except Exception:
+                     except Exception as e:
+                         # The occasional row fails to parse; just skip it
                          pass

+                 # Insert whatever is left in the buffers
                  if db_req_rows:
                      cursor.executemany("INSERT OR REPLACE INTO requests VALUES (?,?,?,?,?)", db_req_rows)
                  if db_resp_rows:
                      cursor.executemany("INSERT OR REPLACE INTO responses VALUES (?,?,?,?,?)", db_resp_rows)

+                 # --- Optimization: create the index only after the bulk insert; this is faster ---
+                 cursor.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
+
                  pcap_mtime = os.path.getmtime(pcap_path)
                  pcap_size = os.path.getsize(pcap_path)
                  cursor.execute("INSERT INTO meta_info (filter, pcap_path, pcap_mtime, pcap_size) VALUES (?, ?, ?, ?)", (display_filter, pcap_path, pcap_mtime, pcap_size))
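Moving CREATE INDEX after the load is the standard bulk-insert ordering: inserting into an indexed table pays a B-tree update per row, while building the index once over the finished table is a single sort. A minimal sketch of the pattern in isolation:

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE responses (frame_num INTEGER PRIMARY KEY, request_in INTEGER)")

# Load first, with no secondary index in place...
rows = [(i, max(i - 1, 1)) for i in range(1, 100_001)]
cur.executemany("INSERT INTO responses VALUES (?, ?)", rows)

# ...then build the index once over the finished table.
cur.execute("CREATE INDEX idx_resp_req_in ON responses(request_in)")
conn.commit()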
@@ -319,18 +349,29 @@ class FlowAnalyzer:
      # --- Helper static methods ---

      @staticmethod
-     def parse_packet_data(packet: dict) -> Tuple[int, int, float, str, str]:
-         frame_num = int(packet["frame.number"][0])
-         request_in = int(packet["http.request_in"][0]) if packet.get("http.request_in") else frame_num
-         full_uri = parse.unquote(packet["http.request.full_uri"][0]) if packet.get("http.request.full_uri") else ""
-         time_epoch = float(packet["frame.time_epoch"][0])
-
-         if packet.get("tcp.reassembled.data"):
-             full_request = packet["tcp.reassembled.data"][0]
-         elif packet.get("tcp.payload"):
-             full_request = packet["tcp.payload"][0]
+     def parse_packet_data(row: list) -> Tuple[int, int, float, str, str]:
+         # row layout:
+         # 0: http.response.code
+         # 1: http.request_in
+         # 2: tcp.reassembled.data
+         # 3: frame.number
+         # 4: tcp.payload
+         # 5: frame.time_epoch
+         # 6: exported_pdu.exported_pdu
+         # 7: http.request.full_uri
+
+         frame_num = int(row[3])
+         request_in = int(row[1]) if row[1] else frame_num
+         full_uri = parse.unquote(row[7]) if row[7] else ""
+         time_epoch = float(row[5])
+
+         if row[2]:
+             full_request = row[2]
+         elif row[4]:
+             full_request = row[4]
          else:
-             full_request = packet["exported_pdu.exported_pdu"][0] if packet.get("exported_pdu.exported_pdu") else ""
+             full_request = row[6] if row[6] else ""
+
          return frame_num, request_in, time_epoch, full_uri, full_request

      @staticmethod
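To make the new row contract concrete, here is a hypothetical call (the hex payload, frame number, and timestamp are made up; the import path assumes the class is exposed at the package root):

from FlowAnalyzer import FlowAnalyzer  # assumed import path

# A request packet: empty response code (index 0), empty request_in (index 1),
# reassembled TCP data present (index 2), so it becomes full_request.
row = ["", "", "474554202f20485454502f312e31", "42", "", "1700000000.123456", "", "http%3A//example.com/"]
frame_num, request_in, time_epoch, full_uri, full_request = FlowAnalyzer.parse_packet_data(row)

assert frame_num == 42
assert request_in == 42                   # empty http.request_in falls back to frame_num
assert full_uri == "http://example.com/"  # parse.unquote() decodes the %3A
assert full_request == row[2]             # reassembled data wins over tcp.payload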
@@ -1,6 +1,6 @@
- Metadata-Version: 2.4
+ Metadata-Version: 2.1
  Name: FlowAnalyzer
- Version: 0.4.2
+ Version: 0.4.3
  Summary: FlowAnalyzer is a traffic analyzer for parsing and processing JSON data files exported by tshark
  Home-page: https://github.com/Byxs20/FlowAnalyzer
  Author: Byxs20
@@ -15,14 +15,6 @@ Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: license-file
- Dynamic: summary

  # FlowAnalyzer

@@ -1,6 +1,6 @@
- Metadata-Version: 2.4
+ Metadata-Version: 2.1
  Name: FlowAnalyzer
- Version: 0.4.2
+ Version: 0.4.3
  Summary: FlowAnalyzer is a traffic analyzer for parsing and processing JSON data files exported by tshark
  Home-page: https://github.com/Byxs20/FlowAnalyzer
  Author: Byxs20
@@ -15,14 +15,6 @@ Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: license-file
- Dynamic: summary

  # FlowAnalyzer

@@ -7,7 +7,7 @@ with open(os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf-8"

  setup(
      name="FlowAnalyzer",
-     version="0.4.2",
+     version="0.4.3",
      description="FlowAnalyzer is a traffic analyzer for parsing and processing JSON data files exported by tshark",
      author="Byxs20",
      author_email="97766819@qq.com",