FlowAnalyzer 0.4.4.tar.gz → 0.4.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -199,9 +199,7 @@ class FlowAnalyzer:
  "-e",
  "http.request.full_uri", # 7
  "-e",
- "http.file_data", # 8
- "-e",
- "tcp.segment.count", # 9
+ "tcp.segment.count", # 8
  "-E",
  "header=n",
  "-E",
@@ -9,7 +9,7 @@ from .logging_config import logger

  class PacketParser:
  @staticmethod
- def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes, bytes]:
+ def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes]:
  """
  Parse one line of Tshark output
  row definition (all bytes):
@@ -20,19 +20,18 @@ class PacketParser:
  4: tcp.payload
  5: frame.time_epoch
  6: exported_pdu.exported_pdu
- 7: http.request.full_uri
- 8: http.file_data
- 9: tcp.segment.count
+ 7: http.request.full_uri
+ 8: tcp.segment.count
  """
  frame_num = int(row[3])
  request_in = int(row[1]) if row[1] else frame_num
  # Decode only URI to string
  full_uri = parse.unquote(row[7].decode("utf-8", errors="replace")) if row[7] else ""
  time_epoch = float(row[5])
- http_file_data = row[8] if len(row) > 8 else b""

  # Logic for Raw Packet (Header Source)
- is_reassembled = len(row) > 9 and row[9]
+ # Previous index 9 is now 8 since we removed http.file_data
+ is_reassembled = len(row) > 8 and row[8]

  if is_reassembled and row[2]:
  full_request = row[2]
@@ -42,7 +41,7 @@ class PacketParser:
  # Fallback (e.g. Exported PDU)
  full_request = row[2] if row[2] else (row[6] if row[6] else b"")

- return frame_num, request_in, time_epoch, full_uri, full_request, http_file_data
+ return frame_num, request_in, time_epoch, full_uri, full_request

  @staticmethod
  def split_http_headers(file_data: bytes) -> Tuple[bytes, bytes]:
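
For orientation, a minimal sketch of the new five-value contract of parse_packet_data. The row below is entirely made up; it only follows the indices documented in the diff (row[3] = frame.number, row[5] = frame.time_epoch, row[7] = http.request.full_uri, row[8] = tcp.segment.count):

from FlowAnalyzer.PacketParser import PacketParser

# Hypothetical row, already split from tshark's tab-separated output.
row = [b"", b"", b"deadbeef", b"42", b"", b"1700000000.0", b"", b"http%3A//example.com/", b"2"]

frame_num, request_in, time_epoch, full_uri, full_request = PacketParser.parse_packet_data(row)
# frame_num == 42, request_in == 42 (falls back to frame_num because row[1] is empty),
# full_uri == "http://example.com/",
# full_request == b"deadbeef" (still hex-encoded, as expected by extract_http_file_data)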
@@ -67,6 +66,10 @@ class PacketParser:
  while cursor < total_len:
  newline_idx = file_data.find(b"\n", cursor)
  if newline_idx == -1:
+ # No newline found: the data does not follow the strict chunked format,
+ # which must terminate with a zero-size chunk. Raising here is acceptable
+ # because callers that merely attempt dechunking suppress the error and
+ # fall back to the raw body.
  raise ValueError("Not chunked data")

  size_line = file_data[cursor:newline_idx].strip()
@@ -74,13 +77,21 @@ class PacketParser:
  cursor = newline_idx + 1
  continue

- chunk_size = int(size_line, 16)
+ try:
+ chunk_size = int(size_line, 16)
+ except ValueError:
+ raise ValueError("Invalid chunk size")
+
  if chunk_size == 0:
  break

  data_start = newline_idx + 1
  data_end = data_start + chunk_size

+ # Robustness check
+ if data_start > total_len:
+ break
+
  if data_end > total_len:
  chunks.append(file_data[data_start:])
  break
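
The loop above implements plain HTTP/1.1 chunked transfer decoding: a hex size line, the chunk data, a trailing CRLF, terminated by a zero-size chunk. A small illustrative call, reusing the sample body from the chunked-decoding test added further down in this diff:

from FlowAnalyzer.PacketParser import PacketParser

# One 5-byte chunk ("Hello") followed by the zero-size terminator.
assert PacketParser.dechunk_http_response(b"5\r\nHello\r\n0\r\n\r\n") == b"Hello"

# Non-chunked input still raises ValueError("Not chunked data"); the caller in
# extract_http_file_data wraps the call in contextlib.suppress(Exception) and
# keeps the raw body instead.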
@@ -88,51 +99,38 @@ class PacketParser:
  chunks.append(file_data[data_start:data_end])

  cursor = data_end
+ # Skip CRLF after chunk data
  while cursor < total_len and file_data[cursor] in (13, 10):
  cursor += 1

  return b"".join(chunks)

  @staticmethod
- def extract_http_file_data(full_request: bytes, http_file_data: bytes) -> Tuple[bytes, bytes]:
+ def extract_http_file_data(full_request: bytes) -> Tuple[bytes, bytes]:
  """
  Extract the file data from an HTTP request or response (hybrid mode - binary-optimized)
  """
  header = b""
  file_data = b""

+ if not full_request:
+ return b"", b""
  try:
- # --- 1. Extract header ---
- if full_request:
- raw_bytes = binascii.unhexlify(full_request)
- h_part, _ = PacketParser.split_http_headers(raw_bytes)
- header = h_part
-
- # --- 2. Extract body ---
- if http_file_data:
- try:
- file_data = binascii.unhexlify(http_file_data)
- return header, file_data
- except binascii.Error:
- logger.warning("Failed to parse http.file_data hex, falling back to the original method")
-
- # --- 3. Fallback mode ---
- if full_request and not file_data:
- raw_bytes = binascii.unhexlify(full_request)
- _, body_part = PacketParser.split_http_headers(raw_bytes)
-
- with contextlib.suppress(Exception):
- body_part = PacketParser.dechunk_http_response(body_part)
-
- with contextlib.suppress(Exception):
- if body_part.startswith(b"\x1f\x8b"):
- body_part = gzip.decompress(body_part)
-
- file_data = body_part
+ raw_bytes = binascii.unhexlify(full_request)
+ header, body_part = PacketParser.split_http_headers(raw_bytes)
+
+ with contextlib.suppress(Exception):
+ body_part = PacketParser.dechunk_http_response(body_part)
+
+ with contextlib.suppress(Exception):
+ if body_part.startswith(b"\x1f\x8b"):
+ body_part = gzip.decompress(body_part)
+
+ file_data = body_part
  return header, file_data

- except ValueError as e:
- logger.error(f"Hex conversion failed: {str(e)[:100]}...")
+ except binascii.Error:
+ logger.error("Hex conversion failed")
  return b"", b""
  except Exception as e:
  logger.error(f"Unknown error while parsing HTTP data: {e}")
@@ -149,12 +147,12 @@ class PacketParser:

  row = line.split(b"\t")
  try:
- frame_num, request_in, time_epoch, full_uri, full_request, http_file_data = PacketParser.parse_packet_data(row)
+ frame_num, request_in, time_epoch, full_uri, full_request = PacketParser.parse_packet_data(row)

- if not full_request and not http_file_data:
+ if not full_request:
  return None

- header, file_data = PacketParser.extract_http_file_data(full_request, http_file_data)
+ header, file_data = PacketParser.extract_http_file_data(full_request)

  # row[0] is http.response.code (bytes)
  is_response = bool(row[0])
@@ -0,0 +1,128 @@
+ import os
+ import time
+ from collections import defaultdict
+ from typing import List, Tuple
+
+ import dpkt
+
+
+ class PcapSplitter:
+ """
+ Encapsulates logic to split a PCAP/PCAPNG file into multiple smaller PCAP files
+ based on TCP flows, dynamically balanced for parallel processing.
+ """
+
+ def __init__(self, pcap_file: str, output_dir: str):
+ self.pcap_file = pcap_file
+ self.output_dir = output_dir
+
+ def get_stream_key(self, tcp, ip) -> Tuple:
+ """Generate a canonical flow key (4-tuple; the protocol is implicitly TCP)."""
+ src = ip.src
+ dst = ip.dst
+ sport = tcp.sport
+ dport = tcp.dport
+ # Canonicalize bidirectional flows to the same key
+ key1 = (src, dst, sport, dport)
+ key2 = (dst, src, dport, sport)
+ return key1 if key1 < key2 else key2
+
+ def split(self, threshold_mb: int = 10, default_chunks: int = 3) -> List[str]:
+ """
+ Split the pcap file into balanced chunks based on stream volume (bytes).
+ Uses a Greedy Partition Algorithm (Longest Processing Time first).
+
+ Args:
+ threshold_mb: File size threshold in MB. If smaller, do not split.
+ default_chunks: Number of chunks to split into if threshold is exceeded.
+
+ Returns:
+ List of generated file paths (or original file if not split).
+ """
+ if not os.path.exists(self.pcap_file):
+ raise FileNotFoundError(f"PCAP file not found: {self.pcap_file}")
+
+ file_size_mb = os.path.getsize(self.pcap_file) / (1024 * 1024)
+ if file_size_mb < threshold_mb:
+ print(f"File size {file_size_mb:.2f}MB < {threshold_mb}MB. Skipping split.")
+ return [self.pcap_file]
+
+ os.makedirs(self.output_dir, exist_ok=True)
+
+ start_time = time.time()
+ # Dictionary to store packets: stream_key -> list of (ts, buf)
+ streams = defaultdict(list)
+ # Dictionary to store total size: stream_key -> total_bytes
+ stream_sizes = defaultdict(int)
+
+ # 1. Read and Group Packets
+ print(f"Reading {self.pcap_file}...")
+ with open(self.pcap_file, "rb") as f:
+ if self.pcap_file.lower().endswith(".pcapng"):
+ reader = dpkt.pcapng.Reader(f)
+ else:
+ reader = dpkt.pcap.Reader(f)
+
+ for ts, buf in reader:
+ try:
+ eth = dpkt.ethernet.Ethernet(buf)
+ if not isinstance(eth.data, dpkt.ip.IP):
+ continue
+ ip = eth.data
+ if not isinstance(ip.data, dpkt.tcp.TCP):
+ continue
+ tcp = ip.data
+
+ key = self.get_stream_key(tcp, ip)
+ streams[key].append((ts, buf))
+ stream_sizes[key] += len(buf)
+ except Exception:
+ continue
+
+ total_streams = len(streams)
+ print(f"Found {total_streams} TCP streams.")
+
+ if total_streams == 0:
+ print("No TCP streams found to split.")
+ return []
+
+ # 2. Assign Streams to Buckets (Greedy LPT Algorithm)
+ num_chunks = min(default_chunks, total_streams)
+
+ # Sort streams by size (descending)
+ sorted_streams = sorted(stream_sizes.items(), key=lambda item: item[1], reverse=True)
+
+ # Buckets: list of (current_size, batch_index, list_of_keys)
+ # We perform standard list sort to find min bucket, sufficient for small N
+ buckets = [[0, i, []] for i in range(num_chunks)]
+
+ for key, size in sorted_streams:
+ # Find bucket with smallest current size
+ buckets.sort(key=lambda x: x[0])
+ smallest_bucket = buckets[0]
+
+ # Add stream to this bucket
+ smallest_bucket[0] += size
+ smallest_bucket[2].append(key)
+
+ print(f"Splitting into {num_chunks} files with volume balancing...")
+ generated_files = []
+
+ # 3. Write Batches
+ # Sorting buckets by index ensures file naming order 0, 1, 2...
+ buckets.sort(key=lambda x: x[1])
+
+ for size, i, batch_keys in buckets:
+ out_file_path = os.path.join(self.output_dir, f"batch_{i}.pcap")
+ generated_files.append(out_file_path)
+
+ with open(out_file_path, "wb") as f:
+ writer = dpkt.pcap.Writer(f)
+ for key in batch_keys:
+ for ts, buf in streams[key]:
+ writer.writepkt(buf, ts)
+
+ print(f" - Created {os.path.basename(out_file_path)}: {len(batch_keys)} streams ({size/1024/1024:.2f} MB)")
+
+ print(f"Split completed in {time.time() - start_time:.2f}s")
+ return generated_files
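
A short usage sketch for the new splitter; the file paths are placeholders, and the parameters shown are the ones defined in split() above (threshold_mb=10, default_chunks=3 by default):

from FlowAnalyzer.PcapSplitter import PcapSplitter

# Hypothetical paths. split() returns the original file untouched when it is
# smaller than threshold_mb; otherwise it writes up to default_chunks
# volume-balanced batch_<i>.pcap files into the output directory.
splitter = PcapSplitter("capture.pcapng", "output")
for path in splitter.split(threshold_mb=10, default_chunks=4):
    print(path)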
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: FlowAnalyzer
- Version: 0.4.4
+ Version: 0.4.5
  Summary: FlowAnalyzer is a traffic analyzer for parsing and processing the JSON data files exported by tshark
  Home-page: https://github.com/Byxs20/FlowAnalyzer
  Author: Byxs20
@@ -5,9 +5,13 @@ FlowAnalyzer/FlowAnalyzer.py
  FlowAnalyzer/Models.py
  FlowAnalyzer/PacketParser.py
  FlowAnalyzer/Path.py
+ FlowAnalyzer/PcapSplitter.py
  FlowAnalyzer/__init__.py
  FlowAnalyzer/logging_config.py
  FlowAnalyzer.egg-info/PKG-INFO
  FlowAnalyzer.egg-info/SOURCES.txt
  FlowAnalyzer.egg-info/dependency_links.txt
- FlowAnalyzer.egg-info/top_level.txt
+ FlowAnalyzer.egg-info/top_level.txt
+ tests/test.py
+ tests/test_parser.py
+ tests/test_split.py
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: FlowAnalyzer
- Version: 0.4.4
+ Version: 0.4.5
  Summary: FlowAnalyzer is a traffic analyzer for parsing and processing the JSON data files exported by tshark
  Home-page: https://github.com/Byxs20/FlowAnalyzer
  Author: Byxs20
@@ -7,7 +7,7 @@ with open(os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf-8"

  setup(
  name="FlowAnalyzer",
- version="0.4.4",
+ version="0.4.5",
  description="FlowAnalyzer is a traffic analyzer for parsing and processing the JSON data files exported by tshark",
  author="Byxs20",
  author_email="97766819@qq.com",
@@ -0,0 +1,48 @@
+ import os
+
+ from viztracer import VizTracer
+
+ from FlowAnalyzer.FlowAnalyzer import FlowAnalyzer
+
+ # ============================
+ # Configuration
+ # ============================
+ PCAP_FILE = "./tests/Beyond_Pro.pcapng" # Path to your test pcap file
+ DISPLAY_FILTER = "http" # tshark display filter, adjust as needed
+
+
+ # ============================
+ # Test logic
+ # ============================
+ def main():
+ if not os.path.exists(PCAP_FILE):
+ print(f"[ERROR] Capture file does not exist: {PCAP_FILE}")
+ return
+
+ print("[*] Parsing PCAP file...")
+ with VizTracer():
+ db_path = FlowAnalyzer.get_db_data(PCAP_FILE, DISPLAY_FILTER)
+ print(f"[*] Parsing finished, database created: {db_path}")
+
+ print("[*] Iterating over HTTP request-response pairs:")
+ analyzer = FlowAnalyzer(db_path)
+ total = 0
+ requests_count = 0
+ responses_count = 0
+
+ for pair in analyzer.generate_http_dict_pairs():
+ total += 1
+ if pair.request:
+ requests_count += 1
+ if pair.response:
+ responses_count += 1
+
+ print(f"[*] Total records: {total}")
+ print(f"[*] Requests: {requests_count}")
+ print(f"[*] Responses: {responses_count}")
+
+ print("[*] Test finished ✅")
+
+
+ if __name__ == "__main__":
+ main()
@@ -0,0 +1,47 @@
+ import binascii
+ import gzip
+ import unittest
+
+ from FlowAnalyzer.PacketParser import PacketParser
+
+
+ class TestPacketParserOptimization(unittest.TestCase):
+ def test_gzip_decompression(self):
+ # Construct a fake HTTP response with GZIP body
+ content = b"Hello, Gzip World!"
+ compressed = gzip.compress(content)
+ header = b"HTTP/1.1 200 OK\r\nContent-Encoding: gzip\r\n\r\n"
+ full_response = header + compressed
+
+ full_request_hex = binascii.hexlify(full_response)
+
+ # Test extract_http_file_data
+ extracted_header, extracted_body = PacketParser.extract_http_file_data(full_request_hex)
+
+ self.assertEqual(extracted_header, header)
+ self.assertEqual(extracted_body, content)
+
+ def test_basic_extraction(self):
+ # Case: Simple text body, no chunking
+ content = b"Simple Body"
+ header = b"HTTP/1.1 200 OK\r\n\r\n"
+ full_response = header + content
+ full_request_hex = binascii.hexlify(full_response)
+
+ extracted_header, extracted_body = PacketParser.extract_http_file_data(full_request_hex)
+ self.assertEqual(extracted_body, content)
+
+ def test_chunked_decoding(self):
+ # Case: Chunked body
+ # 5\r\nHello\r\n0\r\n\r\n
+ chunked_body = b"5\r\nHello\r\n0\r\n\r\n"
+ header = b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n"
+ full_response = header + chunked_body
+ full_request_hex = binascii.hexlify(full_response)
+
+ extracted_header, extracted_body = PacketParser.extract_http_file_data(full_request_hex)
+ self.assertEqual(extracted_body, b"Hello")
+
+
+ if __name__ == "__main__":
+ unittest.main()
@@ -0,0 +1,90 @@
+
+ import os
+ import shutil
+ import subprocess
+
+ from FlowAnalyzer.PcapSplitter import PcapSplitter
+
+ #############################
+ # Configuration
+ #############################
+ PCAP_FILE = r"./tests/Beyond_Pro.pcapng" # Change to your file
+ OUT_DIR = "output"
+ #############################
+
+ def clean_output_dir(directory: str):
+ if os.path.exists(directory):
+ print(f"Cleaning output directory: {directory}")
+ shutil.rmtree(directory)
+ os.makedirs(directory, exist_ok=True)
+
+ def count_packets(pcap_path: str, display_filter: str) -> int:
+ cmd = [
+ "tshark",
+ "-r", pcap_path,
+ "-Y", display_filter,
+ "-T", "fields",
+ "-e", "frame.number"
+ ]
+ try:
+ # Run tshark and capture output
+ result = subprocess.run(
+ cmd,
+ capture_output=True,
+ text=True,
+ check=True
+ )
+ # Count non-empty lines
+ count = sum(1 for line in result.stdout.splitlines() if line.strip())
+ return count
+ except subprocess.CalledProcessError as e:
+ print(f"Error running tshark on {pcap_path}: {e}")
+ return 0
+ except FileNotFoundError:
+ print("Error: tshark not found in PATH.")
+ return 0
+
+ def main():
+ print("Beginning split test...")
+
+ # 1. Clean output directory
+ clean_output_dir(OUT_DIR)
+
+ splitter = PcapSplitter(PCAP_FILE, OUT_DIR)
+
+ # Uses the default threshold and chunk count
+ result_files = splitter.split()
+
+ print(f"\nGenerated {len(result_files)} files:")
+ for f in result_files:
+ print(f)
+
+ # 2. Verify with Tshark
+ print("\nVerifying data integrity with Tshark...")
+ total_requests = 0
+ total_responses = 0
+
+ EXPECTED_REQUESTS = 12284
+ EXPECTED_RESPONSES = 12281
+
+ for pcap in result_files:
+ req_count = count_packets(pcap, "http.request")
+ resp_count = count_packets(pcap, "http.response")
+
+ print(f" {os.path.basename(pcap)}: Requests={req_count}, Responses={resp_count}")
+ total_requests += req_count
+ total_responses += resp_count
+
+ print("-" * 40)
+ print(f"Total Requests: {total_requests} (Expected: {EXPECTED_REQUESTS})")
+ print(f"Total Responses: {total_responses} (Expected: {EXPECTED_RESPONSES})")
+
+ if total_requests == EXPECTED_REQUESTS and total_responses == EXPECTED_RESPONSES:
+ print("\nSUCCESS: Data integrity verified.")
+ else:
+ print("\nFAILURE: Data integrity mismatch!")
+ exit(1)
+
+
+ if __name__ == "__main__":
+ main()
3 files without changes