AdvancedAnalysisFileParser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/AdvancedAnalysisConstants.py +9 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/AdvancedAnalysisParser.py +129 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Models/ConditionOperator.py +9 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Models/FieldCondition.py +29 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Models/FieldWarningConfig.py +4 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Models/JsonDict.py +2 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Models/SectionConfig.py +8 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Models/__init__.py +13 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Parsers/AdvancedAnalysisFileParserFactory.py +36 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Parsers/DragenTruSightOncology500TSVParser.py +102 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Parsers/IAdvancedAnalysisFileParser.py +86 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Parsers/JsonSectionParser.py +39 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Parsers/OneLineTsvParser.py +26 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Parsers/__init__.py +7 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/README.md +570 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/TruSightOncology500.CombinedVariantOutput.tsv +1586 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/dragen424.targeted.json +202 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/dummy_gba_affected_nonrecomb_acn2.targeted.json +21 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/dummy_gba_carrier_one_recomb_only.targeted.json +14 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/dummy_gba_phase_unknown_one_recomb_plus_variant.targeted.json +21 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/dummy_warnset_1.targeted.json +132 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/dummy_warnset_2.targeted.json +145 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/dummy_warnset_3.targeted.json +146 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/gba.tsv +2 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/gba_carrier_1.json +96 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/gba_carrier_2.json +101 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/gba_multiple_phase_unknown_1.json +101 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/gba_multiple_phase_unknown_2.json +105 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/gba_positive_1.json +96 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/gba_positive_2.json +101 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/hba_carrier_1.json +96 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/hba_carrier_2.json +96 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/hba_carrier_3.json +96 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/hba_carrier_4.json +96 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/hba_hemoglobin_h_disease.json +96 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/hba_silent_carrier.json +96 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/smn.tsv +2 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/smn_carrier.json +96 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/smn_positive.json +96 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test/smn_silent_carrier_risk.json +101 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Test_AdvancedAnalysisParser.py +342 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Warnings/CarrierPositiveWarning.py +18 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Warnings/ConditionWarning.py +20 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Warnings/GbaWarning.py +50 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Warnings/GenotypeWarning.py +22 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Warnings/IWarning.py +7 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Warnings/SmnWarning.py +29 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Warnings/WarningFactory.py +31 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Warnings/__init__.py +7 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/__init__.py +18 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/advConfig.json +96 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/dragen_500_tsv_config.json +53 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/gba_tsv_config.json +23 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/run_test_parser.py +29 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/smn_tsv_config.json +23 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser.egg-info/PKG-INFO +152 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser.egg-info/SOURCES.txt +64 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser.egg-info/dependency_links.txt +1 -0
- advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser.egg-info/top_level.txt +1 -0
- advancedanalysisfileparser-0.1.0/LICENSE +9 -0
- advancedanalysisfileparser-0.1.0/MANIFEST.in +3 -0
- advancedanalysisfileparser-0.1.0/PKG-INFO +152 -0
- advancedanalysisfileparser-0.1.0/README.md +132 -0
- advancedanalysisfileparser-0.1.0/pyproject.toml +15 -0
- advancedanalysisfileparser-0.1.0/setup.cfg +26 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from typing import Final
|
|
2
|
+
class AdvancedAnalysisConstants:
    """Namespace of string keys shared by the advanced-analysis parsers.

    The class is used purely as a constant namespace and is never meant to be
    instantiated; ``__slots__ = ()`` keeps any stray instance dict-free.
    """

    __slots__ = ()

    # Keys read from the configuration / request JSON.
    OUTPUT_DIR: Final[str] = "output_dir"
    OUTPUT_JSON: Final[str] = "output_json"
    INPUT_DIR: Final[str] = "input_dir"
    MAP_FILES: Final[str] = "map_files"

    # Keys used inside parsed result structures.
    WARNING_KEY: Final[str] = "Warning"
    SEQUENCE_ID: Final[str] = "sequence_id"
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
from typing import Optional
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from .Models import *
|
|
10
|
+
from .Warnings import *
|
|
11
|
+
from .Parsers import *
|
|
12
|
+
from .AdvancedAnalysisConstants import AdvancedAnalysisConstants
|
|
13
|
+
|
|
14
|
+
class AdvancedAnalysisParser:
    """
    Parse special callers outputs into unified JSON format.

    The input files are specified in a configuration JSON file.
    The output is a JSON file with the parsed data and warnings.
    """

    def __init__(self, json_request: "JsonDict") -> None:
        """Read paths and file lists from the request dict, with safe defaults."""
        self.output_dir: str = json_request.get(AdvancedAnalysisConstants.OUTPUT_DIR, '.')
        self.output_json: str = json_request.get(AdvancedAnalysisConstants.OUTPUT_JSON, 'adv_analysis_output.json')
        self.input_dir: str = json_request.get(AdvancedAnalysisConstants.INPUT_DIR, '.')
        self.input_files: list[str] = json_request.get('input_files', [])

    def run(self, return_dict: bool = False) -> Optional["JsonDict"]:
        """Parse all inputs; either return the result or write it as JSON.

        :param return_dict: when True, skip writing and return the parsed dict.
        :returns: the parsed dict when ``return_dict`` is True, else None.
        """
        result = self.parse_files()
        if return_dict:
            return result
        # Robustness fix: create the output directory first — open() raised
        # FileNotFoundError when it did not exist.
        os.makedirs(self.output_dir, exist_ok=True)
        out_path = os.path.join(self.output_dir, self.output_json)
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=4)
        logging.info(f"Wrote unified JSON to {out_path}")
        return None

    def parse_files(self) -> "JsonDict":
        """Parse every configured input file and merge the results.

        Falls back to scanning ``input_dir`` for .json/.tsv files when no
        explicit file list was configured.

        :raises ValueError: when no input files can be found at all.
        """
        logging.info("Started parsing Advanced Analysis files")
        # Bug fix: the original condition
        #   not files or files == [] and dir and exists(dir)
        # parsed as ``(not files) or (...)`` due to operator precedence, so an
        # empty file list triggered os.listdir() even for a missing directory.
        if not self.input_files and self.input_dir and os.path.exists(self.input_dir):
            folder_files = [
                os.path.join(self.input_dir, f)
                for f in os.listdir(self.input_dir)
                if os.path.isfile(os.path.join(self.input_dir, f))
                and f.lower().endswith(('.json', '.tsv'))
            ]
            self.input_files.extend(folder_files)
        if not self.input_files:
            raise ValueError("No input files found in configuration or input directory.")
        result: "JsonDict" = {}
        for fname in self.input_files:
            if not os.path.exists(fname):
                # Bare filenames are resolved relative to input_dir.
                fname_path = os.path.join(self.input_dir, fname)
                if os.path.exists(fname_path):
                    fname = fname_path
                else:
                    logging.warning(f"File {fname} not found. Skipping.")
                    continue
            parser: "IAdvancedAnalysisFileParser" = AdvancedAnalysisFileParserFactory.get_parser(fname)
            file_parsed_data = self.parse_by_config(parser)
            result.update(file_parsed_data)
        return result

    def parse_by_config(self, parser: "IAdvancedAnalysisFileParser") -> "JsonDict":
        """Run one file parser and shape its output into the unified form."""
        caller_data: "JsonDict" = parser.parse()
        return parser.build_result(caller_data)

    @staticmethod
    def _threshold_warning(w: "JsonDict", parsed: "JsonDict", value_key: str, threshold_key: str, label: str) -> str:
        """Return a warning string when a parsed numeric value meets its threshold.

        :param w: warning config holding ``caller_name`` plus, under
            ``value_key``/``threshold_key``, the parsed-data key and threshold.
        :param parsed: parsed data to look the value up in.
        :param label: human-readable quantity name used in the message.
        :returns: a formatted message, or "" when the threshold is not crossed.
        """
        name = w.get("caller_name")
        key = w.get(value_key)
        threshold = w.get(threshold_key)
        val = parsed.get(key or "")
        if isinstance(val, (int, float)) and threshold is not None and val >= threshold:
            return f"Based on {name}, sample {label} {val} ≥ {threshold}"
        return ""

    @staticmethod
    def parse_args() -> argparse.Namespace:
        """Define and parse the command-line interface."""
        parser = argparse.ArgumentParser(
            description="Parse special callers outputs into unified JSON"
        )
        parser.add_argument("-c", "--config", required=False, help="Path to config JSON file")
        parser.add_argument("--input-dir", help="Input directory")
        parser.add_argument("--output-dir", help="Output directory")
        parser.add_argument("--input-files", nargs='+', help="Input file(s)")
        parser.add_argument("--output-json", help="Output JSON filename")
        return parser.parse_args()

    @classmethod
    def from_cli(cls) -> None:
        """Build a parser from CLI args (merged over an optional config file) and run it."""
        args = cls.parse_args()
        config = {}
        # Load config file if provided.
        if args.config:
            with open(args.config, 'r') as f:
                config = json.load(f)
        # CLI arguments override values from the config file.
        if args.input_dir:
            config[AdvancedAnalysisConstants.INPUT_DIR] = args.input_dir
        if args.output_dir:
            config[AdvancedAnalysisConstants.OUTPUT_DIR] = args.output_dir
        if args.input_files:
            # Build the map_files structure for the given files, keeping any
            # per-file settings already present in the config file.
            map_files = {}
            for fname in args.input_files:
                # Consistency fix: use the shared constant instead of the
                # bare 'map_files' string literal used elsewhere.
                map_files[fname] = config.get(AdvancedAnalysisConstants.MAP_FILES, {}).get(fname, {})
            config[AdvancedAnalysisConstants.MAP_FILES] = map_files
            config['input_files'] = args.input_files
        if args.output_json:
            config[AdvancedAnalysisConstants.OUTPUT_JSON] = args.output_json
        parser = cls(config)
        parser.run()
|
|
111
|
+
|
|
112
|
+
if __name__ == "__main__":
    import sys
    from argparse import ArgumentParser

    # Pre-parse only --output-dir so file logging is configured before the
    # full CLI handling inside AdvancedAnalysisParser.from_cli().
    parser = ArgumentParser(add_help=False)
    parser.add_argument("--output-dir", default=".")
    args, _ = parser.parse_known_args()
    # Robustness fix: logging.FileHandler raises FileNotFoundError when the
    # directory does not exist yet — create it first.
    os.makedirs(args.output_dir, exist_ok=True)
    log_path = os.path.join(args.output_dir, "advanced_analysis.log")
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s: %(message)s",
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(log_path, mode="a", encoding="utf-8")
        ]
    )
    logging.info(f"Logging to {log_path}")
    AdvancedAnalysisParser.from_cli()
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from typing import Any, Callable,Dict,Optional
|
|
2
|
+
from .ConditionOperator import ConditionOperator
|
|
3
|
+
|
|
4
|
+
class FieldCondition:
    """A single predicate over a parsed field value.

    Combines an operator, a comparison value, and the warning message emitted
    when the condition holds. ``field`` optionally names the field this
    condition applies to.
    """

    def __init__(self, operator: "ConditionOperator", value: Any, message: str, field: Optional[str] = None):
        self.operator = operator
        self.value = value
        self.message = message
        # Bug fix: the original accepted ``field`` but silently dropped it.
        self.field = field

    def check(self, value: Any) -> bool:
        """Return True when ``value`` satisfies this condition's operator
        against the configured comparison value."""
        func = OPERATOR_FUNCS[self.operator]
        return func(value, self.value)

    def __str__(self) -> str:
        return self.message
|
|
16
|
+
|
|
17
|
+
def _is_number(x: Any) -> bool:
    # Ordered comparisons are only meaningful for numeric field values;
    # non-numeric values simply fail the condition instead of raising.
    return isinstance(x, (int, float))


def _contains(container: Any, item: Any) -> bool:
    # Substring test for strings, membership test for lists and sets.
    if isinstance(container, str):
        return item in container
    return isinstance(container, (list, set)) and item in container


# map each operator to a function that compares (field_value, target_value) → bool
OPERATOR_FUNCS: Dict[ConditionOperator, Callable[[Any, Any], bool]] = {
    ConditionOperator.EQ: lambda actual, target: actual == target,
    ConditionOperator.NE: lambda actual, target: actual != target,
    ConditionOperator.GT: lambda actual, target: _is_number(actual) and actual > target,
    ConditionOperator.LT: lambda actual, target: _is_number(actual) and actual < target,
    ConditionOperator.GE: lambda actual, target: _is_number(actual) and actual >= target,
    ConditionOperator.LE: lambda actual, target: _is_number(actual) and actual <= target,
    ConditionOperator.CONTAINS: _contains,
}
|
|
28
|
+
|
|
29
|
+
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .JsonDict import JsonDict
|
|
2
|
+
from .FieldCondition import FieldCondition
|
|
3
|
+
from .ConditionOperator import ConditionOperator
|
|
4
|
+
from .FieldWarningConfig import FieldWarningConfig
|
|
5
|
+
from .SectionConfig import SectionConfig
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"JsonDict",
|
|
9
|
+
"ConditionOperator",
|
|
10
|
+
"SectionConfig",
|
|
11
|
+
"FieldCondition",
|
|
12
|
+
"FieldWarningConfig"
|
|
13
|
+
]
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from .IAdvancedAnalysisFileParser import IAdvancedAnalysisFileParser
|
|
2
|
+
from .DragenTruSightOncology500TSVParser import DragenTruSightOncology500TSVParser
|
|
3
|
+
from .OneLineTsvParser import OneLineTsvParser
|
|
4
|
+
from .JsonSectionParser import JsonSectionParser
|
|
5
|
+
from ..Models import JsonDict
|
|
6
|
+
class AdvancedAnalysisFileParserFactory:
    """
    Returns the correct IParser implementation based on file path.
    """

    # Python package that bundles the per-file-type JSON config resources.
    _CONFIG_PACKAGE = 'AdvancedAnalysisFileParser'

    @staticmethod
    def _load_config(config_name: str) -> dict:
        """Load one bundled JSON config by resource name.

        Extracted helper: the original repeated this open/load boilerplate
        four times.
        """
        import importlib.resources
        import json
        resource = importlib.resources.files(
            AdvancedAnalysisFileParserFactory._CONFIG_PACKAGE
        ).joinpath(config_name)
        with resource.open('r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def get_parser(file_path: str) -> "IAdvancedAnalysisFileParser":
        """Pick and construct the parser matching ``file_path``.

        :raises RuntimeError: when the matching bundled config cannot be
            loaded (the original cause is chained).
        :raises ValueError: for unsupported file types.
        """
        lower = file_path.lower()
        load = AdvancedAnalysisFileParserFactory._load_config
        try:
            if lower.endswith(".tsv"):
                if lower.endswith("combinedvariantoutput.tsv"):
                    return DragenTruSightOncology500TSVParser(
                        load('dragen_500_tsv_config.json'), file_path)
                if lower.endswith("smn.tsv"):
                    return OneLineTsvParser(load('smn_tsv_config.json'), file_path)
                if lower.endswith("gba.tsv"):
                    return OneLineTsvParser(load('gba_tsv_config.json'), file_path)
            if lower.endswith(".json"):
                return JsonSectionParser(load('advConfig.json'), file_path)
        except Exception as e:
            # Fix: chain the original exception so the root cause stays
            # visible in tracebacks.
            raise RuntimeError(f"Failed to load config for {file_path}: {e}") from e
        raise ValueError(f"Unsupported file type: {file_path}")
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Dict, Any, List, Optional, Union, Tuple
|
|
4
|
+
from .IAdvancedAnalysisFileParser import IAdvancedAnalysisFileParser
|
|
5
|
+
from ..Models import JsonDict, SectionConfig
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DragenTruSightOncology500TSVParser(IAdvancedAnalysisFileParser):
    """
    Parser for CombinedVariantOutput TSV.
    Reads only configured sections and fields, streaming parse inside loader.

    Key/value sections become dicts; table sections (a header row followed by
    data rows) become lists of per-row dicts keyed by the configured columns.
    NOTE(review): assumes each section is either key/value OR tabular, never
    both — TODO confirm against real CombinedVariantOutput files.
    """

    def parse(self) -> "JsonDict":
        """Entry point: prepare section configs and delegate to the streaming loader."""
        sections = self._prepare_configs(self.section_config)
        return self._load_and_parse_sections(sections)

    def _load_and_parse_sections(self, sections: "Dict[str, Union[SectionConfig, dict]]") -> "JsonDict":
        """
        Open the TSV, detect requested sections, and parse key/value pairs and
        table rows on the fly according to each section's fields or
        include_all flag. Returns a JsonDict mapping section names to data.
        """
        result: "JsonDict" = {}
        headers: Dict[str, List[str]] = {}
        # Per-section (allowed-fields mapping, include_all flag) state.
        state: Dict[str, Tuple["JsonDict", bool]] = {}
        current_section: Optional[str] = None

        with open(self.file_path, 'r', newline='', encoding='utf-8') as f:
            for raw in csv.reader(f, delimiter='\t'):
                if not raw or raw[0].startswith('#'):
                    continue
                # Strip each cell but KEEP empty cells so table columns stay
                # aligned with the header (fix: the original dropped empty
                # cells, shifting every later value one column left).
                cells = [c.strip() for c in raw]
                if not any(cells):
                    continue
                # Section header line, e.g. "[TMB]".
                if cells[0].startswith('[') and cells[0].endswith(']'):
                    sec = cells[0].strip('[]')
                    if sec in sections:
                        current_section = sec
                        state[sec] = self._extract_fields(sections[sec])
                        result[sec] = {}
                        headers.pop(sec, None)
                    else:
                        current_section = None
                    continue

                if current_section is None:
                    continue

                fields, include_all = state[current_section]
                non_empty = [c for c in cells if c]
                if current_section not in headers:
                    # Exactly two non-empty cells before a header row: KV pair.
                    if len(non_empty) == 2:
                        key, val = non_empty
                        if include_all or key in fields:
                            result[current_section][key] = IAdvancedAnalysisFileParser._format_value(val)
                        continue
                    # Otherwise this is the table header; remember the FULL
                    # header so data cells can be matched by position.
                    headers[current_section] = cells
                    # A table section accumulates row records in a list (fix:
                    # the original called .append() on a dict and crashed on
                    # the first data row).
                    result[current_section] = []
                    continue

                # Table data row: keep configured columns only (or all).
                hdr = headers[current_section]
                record = {
                    h: IAdvancedAnalysisFileParser._format_value(cells[i])
                    for i, h in enumerate(hdr)
                    if i < len(cells) and h and (include_all or not fields or h in fields)
                }
                result[current_section].append(record)

        # The original "unwrap single KV-only sections" post-process was a
        # no-op (it reassigned each value to itself) and has been removed.
        return result

    def _prepare_configs(self, config: "JsonDict") -> "Dict[str, Union[SectionConfig, dict]]":
        """Shallow-copy the section config so the caller's dict is not aliased."""
        return dict(config)

    def _extract_fields(self, cfg: "Union[SectionConfig, dict]") -> "Tuple[JsonDict, bool]":
        """Return (allowed-field mapping, include_all flag) for one section config.

        Only the field NAMES matter for filtering; any per-field config values
        are dropped (fix: the original built an intermediate copy and then
        immediately overwrote it).
        """
        if isinstance(cfg, SectionConfig):
            return cfg.fields or {}, cfg.include_all_fields
        raw_fields = cfg.get('fields') or {}
        include_all = cfg.get('include_all_fields', False)
        return {k: None for k in raw_fields}, include_all
|
advancedanalysisfileparser-0.1.0/AdvancedAnalysisFileParser/Parsers/IAdvancedAnalysisFileParser.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from ..Warnings import IWarning
|
|
7
|
+
|
|
8
|
+
from AdvancedAnalysis.AdvancedAnalysisFileParser.AdvancedAnalysisConstants import AdvancedAnalysisConstants
|
|
9
|
+
from AdvancedAnalysis.AdvancedAnalysisFileParser.Warnings.WarningFactory import WarningFactory
|
|
10
|
+
from ..Models import JsonDict
|
|
11
|
+
class IAdvancedAnalysisFileParser(ABC):
    """Base class for all advanced-analysis file parsers.

    Subclasses implement :meth:`parse`; :meth:`build_result` then shapes the
    parsed data into the unified output structure.
    """

    # Per-section parsing configuration (section/caller name -> config dict).
    section_config: "JsonDict"
    file_config: "JsonDict"

    def __init__(self, section_config: "JsonDict", file_path: str) -> None:
        """Validate ``file_path`` and resolve it relative to the package if needed.

        :raises TypeError: when ``file_path`` is not a string.
        :raises ValueError: when it is empty or has an unsupported extension.
        :raises FileNotFoundError: when the file cannot be located.
        """
        # Type check first so non-string input (including falsy values like 0)
        # gets a TypeError rather than a misleading ValueError.
        if not isinstance(file_path, str):
            # Fix: the original referenced the undefined name ``filename``
            # here, raising NameError instead of the intended TypeError.
            raise TypeError(f"Filename must be a string, got {type(file_path).__name__}")
        if not file_path:
            raise ValueError("Filename cannot be empty")
        if not file_path.lower().endswith(('.json', '.tsv')):
            # Fix: the original message claimed a lowercase problem, but the
            # check is on the file extension.
            raise ValueError("Filename must end with .json or .tsv")
        # Check if the file exists on path; fall back to resolving relative
        # to the package directory.
        if not os.path.exists(file_path):
            base = Path(__file__).resolve().parent.parent
            file_path = os.path.join(base, file_path)
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")
        self.section_config = section_config
        self.file_path: str = file_path

    @abstractmethod
    def parse(self) -> "JsonDict":
        """Parse the input file into a raw section-name -> data mapping."""

    def build_result(self, data: "JsonDict") -> "JsonDict":
        """Shape parsed ``data`` into the unified result structure.

        Emits ``{CALLER: {"caller_name": ..., "data": ..., ["warning": ...]}}``
        per configured caller. The warning entry is produced only for callers
        configured WITHOUT explicit fields and WITH a Warning config.
        NOTE(review): callers with neither fields nor a Warning config are
        silently omitted — behavior preserved from the original; confirm this
        is intended.
        """
        result: "JsonDict" = {}
        for caller_name, caller_config in self.section_config.items():
            caller_title = caller_config.get("caller_name", None)
            fields = caller_config.get("fields", None)
            caller_data = data.get(caller_name, {})
            key = caller_name.upper()
            if fields is None:
                caller_warning = caller_config.get(AdvancedAnalysisConstants.WARNING_KEY, None)
                if not caller_warning:
                    continue
                warning_formatter: "IWarning" = WarningFactory.get_formatter(caller_warning)
                warning_text = warning_formatter.format(caller_warning, caller_data)
                entry = result.setdefault(key, {"data": {}, "warning": {}})
                entry["caller_name"] = caller_title
                entry["data"] = caller_data
                entry["warning"] = warning_text
            else:
                entry = result.setdefault(key, {"data": {}})
                entry["caller_name"] = caller_title
                entry["data"] = caller_data
        return result

    @staticmethod
    def _format_value(v: Any) -> Any:
        """Normalize a raw parsed value.

        Recurses into dicts/lists; converts "true"/"false" to booleans and
        "none"/"null"/"" to None; parses numeric strings (floats rounded to
        2 decimals); rounds bare floats. Empty list entries are dropped and an
        emptied list becomes None. Everything else passes through unchanged.
        """
        if isinstance(v, dict):
            # Renamed loop vars: the original shadowed ``v`` inside the
            # comprehension, which worked but read as a bug.
            return {key: IAdvancedAnalysisFileParser._format_value(val) for key, val in v.items()}
        if isinstance(v, list):
            clean = [IAdvancedAnalysisFileParser._format_value(x) for x in v if x not in ("", None)]
            return clean or None
        if isinstance(v, str):
            low = v.lower()
            if low == "true":
                return True
            if low == "false":
                return False
            if low in ("none", "null", ""):
                return None
            try:
                return int(v)
            except ValueError:
                pass
            try:
                return round(float(v), 2)
            except ValueError:
                return v
        if isinstance(v, float):
            return round(v, 2)
        return v
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from .IAdvancedAnalysisFileParser import IAdvancedAnalysisFileParser
|
|
3
|
+
from ..Models import JsonDict
|
|
4
|
+
class JsonSectionParser(IAdvancedAnalysisFileParser):
    """
    Parse JSON files with a specific section into a unified format.
    Config keys are matched against the input's top-level keys
    case-insensitively.
    """

    def parse(self) -> "JsonDict":
        """Extract every configured section from the JSON file.

        Missing/empty sections are logged; every configured key is present in
        the result (as an empty dict when absent from the input).
        """
        import logging  # local import: the module top only imports json
        data = JsonSectionParser._load_json(self.file_path)
        result: "JsonDict" = {}
        # Build a case-insensitive lookup of the input's top-level keys.
        data_key_map = {k.lower(): k for k in data.keys()}
        for key in self.section_config.keys():
            caller_data: "JsonDict" = {}
            actual_key = data_key_map.get(key.lower())
            if actual_key is None:
                logging.warning(f"Key '{key}' from config not found in input data. Skipping.")
            else:
                section = data.get(actual_key, {})
                if section == {}:
                    logging.warning(f"Section '{key}' is empty in the input data.")
                for k, value in section.items():
                    if k == 'variants':
                        # Normalize each variant record individually (fix:
                        # dropped the unused enumerate index).
                        caller_data[k] = [IAdvancedAnalysisFileParser._format_value(var) for var in value]
                    else:
                        caller_data[k] = IAdvancedAnalysisFileParser._format_value(value)
            result[key] = caller_data
        return result

    @staticmethod
    def _load_json(path: str) -> "JsonDict":
        """Load and return the JSON document at ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
from .IAdvancedAnalysisFileParser import IAdvancedAnalysisFileParser
|
|
3
|
+
from ..Models import JsonDict
|
|
4
|
+
# --- Parser Implementations ---
|
|
5
|
+
class OneLineTsvParser(IAdvancedAnalysisFileParser):
    """Parser for two-line TSVs: one header line followed by one value line.

    The parsed record is returned under the config key whose name matches the
    file's base name, falling back to the first configured key.
    """

    is_tsv_parser = True

    def parse(self) -> "JsonDict":
        with open(self.file_path, 'r') as fh:
            header_cells = fh.readline().strip().split('\t')
            value_cells = fh.readline().strip().split('\t')
        record = {
            column: IAdvancedAnalysisFileParser._format_value(cell)
            for column, cell in zip(header_cells, value_cells)
        }

        # Pick the caller name from config: prefer the key matching the file's
        # base name (e.g. 'GBA' for gba.tsv), otherwise the first config key.
        import os
        stem = os.path.splitext(os.path.basename(self.file_path))[0].lower()
        caller_name = next(
            (key for key in self.section_config if key.lower() == stem),
            next(iter(self.section_config.keys())),
        )

        # Only return the parsed record under the chosen caller name.
        return {caller_name: record}
|