FlowAnalyzer 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -199,9 +199,7 @@ class FlowAnalyzer:
             "-e",
             "http.request.full_uri", # 7
             "-e",
-            "http.file_data", # 8
-            "-e",
-            "tcp.segment.count", # 9
+            "tcp.segment.count", # 8
             "-E",
             "header=n",
             "-E",
@@ -9,7 +9,7 @@ from .logging_config import logger
 
 class PacketParser:
     @staticmethod
-    def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes, bytes]:
+    def parse_packet_data(row: list) -> Tuple[int, int, float, str, bytes]:
         """
         Parse one line of Tshark output
         row definition (all bytes):
@@ -21,18 +21,17 @@ class PacketParser:
         5: frame.time_epoch
         6: exported_pdu.exported_pdu
         7: http.request.full_uri
-        8: http.file_data
-        9: tcp.segment.count
+        8: tcp.segment.count
         """
         frame_num = int(row[3])
         request_in = int(row[1]) if row[1] else frame_num
         # Decode only URI to string
         full_uri = parse.unquote(row[7].decode("utf-8", errors="replace")) if row[7] else ""
         time_epoch = float(row[5])
-        http_file_data = row[8] if len(row) > 8 else b""
 
         # Logic for Raw Packet (Header Source)
-        is_reassembled = len(row) > 9 and row[9]
+        # Previous index 9 is now 8 since we removed http.file_data
+        is_reassembled = len(row) > 8 and row[8]
 
         if is_reassembled and row[2]:
             full_request = row[2]
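To make the index shift concrete, here is a hypothetical tab-split row such as the parser above might receive in 0.4.6. Only indices 0, 3, 5, 6, 7 and 8 are taken from the diff; the remaining values are placeholders, not fields documented here:

    # Hypothetical example row (all placeholder values, for illustration only).
    row = [
        b"200",               # 0: http.response.code
        b"",                  # 1: placeholder (request frame reference)
        b"",                  # 2: placeholder (hex-encoded raw packet data)
        b"42",                # 3: frame.number
        b"",                  # 4: placeholder
        b"1700000000.123",    # 5: frame.time_epoch
        b"",                  # 6: exported_pdu.exported_pdu
        b"http%3A%2F%2Fexample.com%2F",  # 7: http.request.full_uri
        b"3",                 # 8: tcp.segment.count (was index 9 before 0.4.6)
    ]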
@@ -42,16 +41,18 @@ class PacketParser:
         # Fallback (e.g. Exported PDU)
         full_request = row[2] if row[2] else (row[6] if row[6] else b"")
 
-        return frame_num, request_in, time_epoch, full_uri, full_request, http_file_data
+        return frame_num, request_in, time_epoch, full_uri, full_request
 
     @staticmethod
     def split_http_headers(file_data: bytes) -> Tuple[bytes, bytes]:
         headerEnd = file_data.find(b"\r\n\r\n")
         if headerEnd != -1:
             return file_data[: headerEnd + 4], file_data[headerEnd + 4 :]
-        elif file_data.find(b"\n\n") != -1:
-            headerEnd = file_data.index(b"\n\n") + 2
-            return file_data[:headerEnd], file_data[headerEnd:]
+
+        headerEnd = file_data.find(b"\n\n")
+        if headerEnd != -1:
+            return file_data[: headerEnd + 2], file_data[headerEnd + 2 :]
+
         return b"", file_data
 
     @staticmethod
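As a standalone illustration (not part of the diff), the rewritten split_http_headers applies the same slicing logic to a bare LF-LF header terminator as to CRLF-CRLF; the sample payloads below are made up:

    crlf_msg = b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nok"
    lf_msg = b"HTTP/1.1 200 OK\nContent-Length: 2\n\nok"

    # Both return (header bytes including the blank line, body bytes).
    print(PacketParser.split_http_headers(crlf_msg))  # (..., b"ok")
    print(PacketParser.split_http_headers(lf_msg))    # (..., b"ok")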
@@ -67,20 +68,36 @@ class PacketParser:
         while cursor < total_len:
             newline_idx = file_data.find(b"\n", cursor)
             if newline_idx == -1:
+                # If no newline is found, the data does not match the strict
+                # chunked format (which must end with a 0-size chunk).
+                # The caller only "tries" to dechunk and suppresses this error,
+                # so raising here simply marks the body as not chunked.
                 raise ValueError("Not chunked data")
 
             size_line = file_data[cursor:newline_idx].strip()
+            # Handle chunk extension: ignore everything after ';'
+            if b";" in size_line:
+                size_line = size_line.split(b";", 1)[0].strip()
+
             if not size_line:
                 cursor = newline_idx + 1
                 continue
 
-            chunk_size = int(size_line, 16)
+            try:
+                chunk_size = int(size_line, 16)
+            except ValueError:
+                raise ValueError("Invalid chunk size")
+
             if chunk_size == 0:
                 break
 
             data_start = newline_idx + 1
             data_end = data_start + chunk_size
 
+            # Robustness check
+            if data_start > total_len:
+                break
+
             if data_end > total_len:
                 chunks.append(file_data[data_start:])
                 break
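For reference, a made-up chunked body that exercises the two additions above (chunk-extension stripping and the explicit size check); this example is not part of the package:

    # "4;name=val" carries a chunk extension, which 0.4.6 now ignores.
    body = b"4;name=val\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n"
    print(PacketParser.dechunk_http_response(body))  # b"Wikipedia"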
@@ -88,51 +105,38 @@ class PacketParser:
             chunks.append(file_data[data_start:data_end])
 
             cursor = data_end
+            # Skip CRLF after chunk data
             while cursor < total_len and file_data[cursor] in (13, 10):
                 cursor += 1
 
         return b"".join(chunks)
 
     @staticmethod
-    def extract_http_file_data(full_request: bytes, http_file_data: bytes) -> Tuple[bytes, bytes]:
+    def extract_http_file_data(full_request: bytes) -> Tuple[bytes, bytes]:
         """
         Extract the file data from an HTTP request or response (hybrid mode, binary-optimized)
         """
         header = b""
         file_data = b""
 
+        if not full_request:
+            return b"", b""
         try:
-            # --- 1. Extract header ---
-            if full_request:
-                raw_bytes = binascii.unhexlify(full_request)
-                h_part, _ = PacketParser.split_http_headers(raw_bytes)
-                header = h_part
-
-            # --- 2. Extract body ---
-            if http_file_data:
-                try:
-                    file_data = binascii.unhexlify(http_file_data)
-                    return header, file_data
-                except binascii.Error:
-                    logger.warning("Failed to parse http.file_data hex, falling back to the raw data")
-
-            # --- 3. Fallback mode ---
-            if full_request and not file_data:
-                raw_bytes = binascii.unhexlify(full_request)
-                _, body_part = PacketParser.split_http_headers(raw_bytes)
-
-                with contextlib.suppress(Exception):
-                    body_part = PacketParser.dechunk_http_response(body_part)
-
-                with contextlib.suppress(Exception):
-                    if body_part.startswith(b"\x1f\x8b"):
-                        body_part = gzip.decompress(body_part)
-
-                file_data = body_part
+            raw_bytes = binascii.unhexlify(full_request)
+            header, body_part = PacketParser.split_http_headers(raw_bytes)
+
+            with contextlib.suppress(Exception):
+                body_part = PacketParser.dechunk_http_response(body_part)
+
+            with contextlib.suppress(Exception):
+                if body_part.startswith(b"\x1f\x8b"):
+                    body_part = gzip.decompress(body_part)
+
+            file_data = body_part
             return header, file_data
 
-        except ValueError as e:
-            logger.error(f"Hex conversion failed: {str(e)[:100]}...")
+        except binascii.Error:
+            logger.error("Hex conversion failed")
             return b"", b""
         except Exception as e:
             logger.error(f"Unknown error while parsing HTTP data: {e}")
@@ -149,12 +153,12 @@ class PacketParser:
 
         row = line.split(b"\t")
         try:
-            frame_num, request_in, time_epoch, full_uri, full_request, http_file_data = PacketParser.parse_packet_data(row)
+            frame_num, request_in, time_epoch, full_uri, full_request = PacketParser.parse_packet_data(row)
 
-            if not full_request and not http_file_data:
+            if not full_request:
                 return None
 
-            header, file_data = PacketParser.extract_http_file_data(full_request, http_file_data)
+            header, file_data = PacketParser.extract_http_file_data(full_request)
 
             # row[0] is http.response.code (bytes)
             is_response = bool(row[0])
@@ -0,0 +1,128 @@
+import os
+import time
+from collections import defaultdict
+from typing import List, Tuple
+
+import dpkt
+
+
+class PcapSplitter:
+    """
+    Encapsulates logic to split a PCAP/PCAPNG file into multiple smaller PCAP files
+    based on TCP flows, dynamically balanced for parallel processing.
+    """
+
+    def __init__(self, pcap_file: str, output_dir: str):
+        self.pcap_file = pcap_file
+        self.output_dir = output_dir
+
+    def get_stream_key(self, tcp, ip) -> Tuple:
+        """Generate a 5-tuple key for the flow."""
+        src = ip.src
+        dst = ip.dst
+        sport = tcp.sport
+        dport = tcp.dport
+        # Canonicalize bidirectional flows to the same key
+        key1 = (src, dst, sport, dport)
+        key2 = (dst, src, dport, sport)
+        return key1 if key1 < key2 else key2
+
+    def split(self, threshold_mb: int = 10, default_chunks: int = 3) -> List[str]:
+        """
+        Split the pcap file into balanced chunks based on stream volume (bytes).
+        Uses a Greedy Partition Algorithm (Longest Processing Time first).
+
+        Args:
+            threshold_mb: File size threshold in MB. If smaller, do not split.
+            default_chunks: Number of chunks to split into if threshold is exceeded.
+
+        Returns:
+            List of generated file paths (or original file if not split).
+        """
+        if not os.path.exists(self.pcap_file):
+            raise FileNotFoundError(f"PCAP file not found: {self.pcap_file}")
+
+        file_size_mb = os.path.getsize(self.pcap_file) / (1024 * 1024)
+        if file_size_mb < threshold_mb:
+            print(f"File size {file_size_mb:.2f}MB < {threshold_mb}MB. Skipping split.")
+            return [self.pcap_file]
+
+        os.makedirs(self.output_dir, exist_ok=True)
+
+        start_time = time.time()
+        # Dictionary to store packets: stream_key -> list of (ts, buf)
+        streams = defaultdict(list)
+        # Dictionary to store total size: stream_key -> total_bytes
+        stream_sizes = defaultdict(int)
+
+        # 1. Read and Group Packets
+        print(f"Reading {self.pcap_file}...")
+        with open(self.pcap_file, "rb") as f:
+            if self.pcap_file.lower().endswith(".pcapng"):
+                reader = dpkt.pcapng.Reader(f)
+            else:
+                reader = dpkt.pcap.Reader(f)
+
+            for ts, buf in reader:
+                try:
+                    eth = dpkt.ethernet.Ethernet(buf)
+                    if not isinstance(eth.data, dpkt.ip.IP):
+                        continue
+                    ip = eth.data
+                    if not isinstance(ip.data, dpkt.tcp.TCP):
+                        continue
+                    tcp = ip.data
+
+                    key = self.get_stream_key(tcp, ip)
+                    streams[key].append((ts, buf))
+                    stream_sizes[key] += len(buf)
+                except Exception:
+                    continue
+
+        total_streams = len(streams)
+        print(f"Found {total_streams} TCP streams.")
+
+        if total_streams == 0:
+            print("No TCP streams found to split.")
+            return []
+
+        # 2. Assign Streams to Buckets (Greedy LPT Algorithm)
+        num_chunks = min(default_chunks, total_streams)
+
+        # Sort streams by size (descending)
+        sorted_streams = sorted(stream_sizes.items(), key=lambda item: item[1], reverse=True)
+
+        # Buckets: list of (current_size, batch_index, list_of_keys)
+        # A plain list sort to find the smallest bucket is sufficient for small N
+        buckets = [[0, i, []] for i in range(num_chunks)]
+
+        for key, size in sorted_streams:
+            # Find bucket with smallest current size
+            buckets.sort(key=lambda x: x[0])
+            smallest_bucket = buckets[0]
+
+            # Add stream to this bucket
+            smallest_bucket[0] += size
+            smallest_bucket[2].append(key)
+
+        print(f"Splitting into {num_chunks} files with volume balancing...")
+        generated_files = []
+
+        # 3. Write Batches
+        # Sorting buckets by index keeps the file naming order 0, 1, 2...
+        buckets.sort(key=lambda x: x[1])
+
+        for size, i, batch_keys in buckets:
+            out_file_path = os.path.join(self.output_dir, f"batch_{i}.pcap")
+            generated_files.append(out_file_path)
+
+            with open(out_file_path, "wb") as f:
+                writer = dpkt.pcap.Writer(f)
+                for key in batch_keys:
+                    for ts, buf in streams[key]:
+                        writer.writepkt(buf, ts)
+
+            print(f" - Created {os.path.basename(out_file_path)}: {len(batch_keys)} streams ({size/1024/1024:.2f} MB)")
+
+        print(f"Split completed in {time.time() - start_time:.2f}s")
+        return generated_files
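Assuming the module is importable as FlowAnalyzer.PcapSplitter (as the RECORD listing below suggests), a usage sketch with example paths:

    from FlowAnalyzer.PcapSplitter import PcapSplitter

    # Files smaller than threshold_mb are returned untouched; larger captures
    # are split into batch_0.pcap ... batch_N.pcap under output_dir.
    splitter = PcapSplitter("capture.pcapng", output_dir="batches")
    parts = splitter.split(threshold_mb=10, default_chunks=3)
    print(parts)

Sorting streams by size and always placing the next stream into the lightest bucket is the classic LPT heuristic, which keeps the batches roughly balanced without solving the partition problem exactly.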
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: FlowAnalyzer
-Version: 0.4.4
+Version: 0.4.6
 Summary: FlowAnalyzer is a traffic analyzer for parsing and processing JSON data files exported by tshark
 Home-page: https://github.com/Byxs20/FlowAnalyzer
 Author: Byxs20
@@ -0,0 +1,12 @@
+FlowAnalyzer/FlowAnalyzer.py,sha256=9SshWk5wf0XATI7W4eBiIpzEqeGFyQJs3on5ox-zrNQ,12666
+FlowAnalyzer/Models.py,sha256=2x7nPJIAyLTC1oiGFlW4mELDPgthk2IsmuyearT-MSQ,622
+FlowAnalyzer/PacketParser.py,sha256=So3iD2ykkWpT0e3aLjBdx_ohoNscD-oAt4bfr_oRqgo,6331
+FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
+FlowAnalyzer/PcapSplitter.py,sha256=0E_vmLYYsE_gD34XTwG1XPx5kBg8ZchJspQEnkBoIdY,4855
+FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
+FlowAnalyzer/logging_config.py,sha256=fnBlvoimteQ38IBlQBV9fdLQvfAlRgGhcvLpUC3YunA,732
+flowanalyzer-0.4.6.dist-info/licenses/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
+flowanalyzer-0.4.6.dist-info/METADATA,sha256=j9Bw-2Sr1dx_DatRtxo56WE0BB1-WMOoIhfoSoSYk-Y,6099
+flowanalyzer-0.4.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+flowanalyzer-0.4.6.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
+flowanalyzer-0.4.6.dist-info/RECORD,,
@@ -1,11 +0,0 @@
-FlowAnalyzer/FlowAnalyzer.py,sha256=GPXZeM1uiLmv_-UKtIwYlfYJ450Etpbtt4V2i_MpLhQ,12721
-FlowAnalyzer/Models.py,sha256=2x7nPJIAyLTC1oiGFlW4mELDPgthk2IsmuyearT-MSQ,622
-FlowAnalyzer/PacketParser.py,sha256=fGql84e-tu1PDsXh3NxctKaSh5YeYsJbh5ZCUe6Mo40,6329
-FlowAnalyzer/Path.py,sha256=E5VvucTftp8VTQUffFzFWHotQEYtZL-j7IQPOaleiug,130
-FlowAnalyzer/__init__.py,sha256=vfiHONPTrvjUU3MwhjFOEo3sWfzlhkA6gOLn_4UJ7sg,70
-FlowAnalyzer/logging_config.py,sha256=fnBlvoimteQ38IBlQBV9fdLQvfAlRgGhcvLpUC3YunA,732
-flowanalyzer-0.4.4.dist-info/licenses/LICENSE,sha256=ybAV0ECduYBZCpjkHyNALVWRRmT_eM0BDgqUszhwEFU,1080
-flowanalyzer-0.4.4.dist-info/METADATA,sha256=aUWcp8_ocQIgz0k_3IlhEHrkzsLHYo7XBPqEayIOGc0,6099
-flowanalyzer-0.4.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-flowanalyzer-0.4.4.dist-info/top_level.txt,sha256=2MtvAF6dEe_eHipw_6G5pFLb2uOCbGnlH0bC4iBtm5A,13
-flowanalyzer-0.4.4.dist-info/RECORD,,