flua 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
flua-0.0.1/.gitignore ADDED
@@ -0,0 +1,208 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ notebooks/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py.cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # UV
99
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ #uv.lock
103
+
104
+ # poetry
105
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
107
+ # commonly ignored for libraries.
108
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109
+ #poetry.lock
110
+ #poetry.toml
111
+
112
+ # pdm
113
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
114
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
115
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
116
+ #pdm.lock
117
+ #pdm.toml
118
+ .pdm-python
119
+ .pdm-build/
120
+
121
+ # pixi
122
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
123
+ #pixi.lock
124
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
125
+ # in the .venv directory. It is recommended not to include this directory in version control.
126
+ .pixi
127
+
128
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
129
+ __pypackages__/
130
+
131
+ # Celery stuff
132
+ celerybeat-schedule
133
+ celerybeat.pid
134
+
135
+ # SageMath parsed files
136
+ *.sage.py
137
+
138
+ # Environments
139
+ .env
140
+ .envrc
141
+ .venv
142
+ env/
143
+ venv/
144
+ ENV/
145
+ env.bak/
146
+ venv.bak/
147
+
148
+ # Spyder project settings
149
+ .spyderproject
150
+ .spyproject
151
+
152
+ # Rope project settings
153
+ .ropeproject
154
+
155
+ # mkdocs documentation
156
+ /site
157
+
158
+ # mypy
159
+ .mypy_cache/
160
+ .dmypy.json
161
+ dmypy.json
162
+
163
+ # Pyre type checker
164
+ .pyre/
165
+
166
+ # pytype static type analyzer
167
+ .pytype/
168
+
169
+ # Cython debug symbols
170
+ cython_debug/
171
+
172
+ # PyCharm
173
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
174
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
175
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
176
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
177
+ #.idea/
178
+
179
+ # Abstra
180
+ # Abstra is an AI-powered process automation framework.
181
+ # Ignore directories containing user credentials, local state, and settings.
182
+ # Learn more at https://abstra.io/docs
183
+ .abstra/
184
+
185
+ # Visual Studio Code
186
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
187
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
188
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
189
+ # you could uncomment the following to ignore the entire vscode folder
190
+ # .vscode/
191
+
192
+ # Ruff stuff:
193
+ .ruff_cache/
194
+
195
+ # PyPI configuration file
196
+ .pypirc
197
+
198
+ # Cursor
199
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
200
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
201
+ # refer to https://docs.cursor.com/context/ignore-files
202
+ .cursorignore
203
+ .cursorindexingignore
204
+
205
+ # Marimo
206
+ marimo/_static/
207
+ marimo/_lsp/
208
+ __marimo__/
@@ -0,0 +1,17 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.8.0
4
+ hooks:
5
+ - id: ruff-format
6
+ - id: ruff
7
+ args: [--fix, --exit-non-zero-on-fix]
8
+
9
+ - repo: local
10
+ hooks:
11
+ - id: pytest
12
+ name: pytest
13
+ entry: pytest --tb=short -q
14
+ language: system
15
+ types: [python]
16
+ pass_filenames: false
17
+ always_run: true
flua-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Sangwook Kim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
flua-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,69 @@
1
+ Metadata-Version: 2.4
2
+ Name: flua
3
+ Version: 0.0.1
4
+ Summary: A library for structured flu sequence analysis
5
+ Project-URL: Homepage, https://github.com/ov3rfit/flua
6
+ Project-URL: Repository, https://github.com/ov3rfit/flua
7
+ Project-URL: Issues, https://github.com/ov3rfit/flua/issues
8
+ Author-email: Sangwook Kim <windaheadjp@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: biopython>=1.85
21
+ Requires-Dist: pandas>=2.0
22
+ Provides-Extra: dev
23
+ Requires-Dist: ipykernel>=6.0; extra == 'dev'
24
+ Requires-Dist: pre-commit>=4.0; extra == 'dev'
25
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
26
+ Requires-Dist: pytest>=8.0; extra == 'dev'
27
+ Requires-Dist: ruff>=0.8; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # flua
31
+
32
+ > **Note:** This project is under active development. APIs may change without notice.
33
+
34
+ Influenza A sequence analysis toolkit.
35
+
36
+ ## (Current) Features
37
+
38
+ - Load and parse influenza A FASTA files
39
+ - Automatic sequence type detection (DNA / RNA / Protein)
40
+ - Subtype extraction from FASTA headers (e.g. H1N1, H5N1pdm09)
41
+ - Segment identification (PB2, PB1, PA, HA, NP, NA, MP, NS)
42
+ - Translation with alternative product generation (splicing, frameshift, alt-ORF)
43
+ - DataFrame export for multi-sample comparative analysis
44
+
45
+ ## Installation
46
+
47
+ ```bash
48
+ pip install -e ".[dev]"
49
+ ```
50
+
51
+ ## Usage
52
+
53
+ ```python
54
+ from flua import load_fasta, groups_to_dataframe, load_multiple_fasta
55
+
56
+ group = load_fasta("sample.fasta")
57
+ print(group.subtype) # e.g. "H1N1"
58
+
59
+ groups = load_multiple_fasta(["sample1.fasta", "sample2.fasta"])
60
+ df = groups_to_dataframe(groups, value_type="translated")
61
+ ```
62
+
63
+ ## Development
64
+
65
+ ```bash
66
+ pip install -e ".[dev]"
67
+ pytest
68
+ ruff check src/ tests/
69
+ ```
flua-0.0.1/README.md ADDED
@@ -0,0 +1,40 @@
1
+ # flua
2
+
3
+ > **Note:** This project is under active development. APIs may change without notice.
4
+
5
+ Influenza A sequence analysis toolkit.
6
+
7
+ ## (Current) Features
8
+
9
+ - Load and parse influenza A FASTA files
10
+ - Automatic sequence type detection (DNA / RNA / Protein)
11
+ - Subtype extraction from FASTA headers (e.g. H1N1, H5N1pdm09)
12
+ - Segment identification (PB2, PB1, PA, HA, NP, NA, MP, NS)
13
+ - Translation with alternative product generation (splicing, frameshift, alt-ORF)
14
+ - DataFrame export for multi-sample comparative analysis
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ pip install -e ".[dev]"
20
+ ```
21
+
22
+ ## Usage
23
+
24
+ ```python
25
+ from flua import load_fasta, groups_to_dataframe, load_multiple_fasta
26
+
27
+ group = load_fasta("sample.fasta")
28
+ print(group.subtype) # e.g. "H1N1"
29
+
30
+ groups = load_multiple_fasta(["sample1.fasta", "sample2.fasta"])
31
+ df = groups_to_dataframe(groups, value_type="translated")
32
+ ```
33
+
34
+ ## Development
35
+
36
+ ```bash
37
+ pip install -e ".[dev]"
38
+ pytest
39
+ ruff check src/ tests/
40
+ ```
@@ -0,0 +1,99 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+
6
+ # ---- Project Metadata ----
7
+
8
+ [project]
9
+ name = "flua"
10
+ version = "0.0.1"
11
+ description = "A library for structured flu sequence analysis"
12
+ readme = "README.md"
13
+ license = "MIT"
14
+ requires-python = ">=3.11"
15
+ authors = [
16
+ { name = "Sangwook Kim", email = "windaheadjp@gmail.com" },
17
+ ]
18
+ classifiers = [
19
+ "Development Status :: 3 - Alpha",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "License :: OSI Approved :: MIT License",
26
+ "Operating System :: OS Independent",
27
+ ]
28
+ dependencies = [
29
+ "pandas>=2.0",
30
+ "biopython>=1.85",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ dev = [
35
+ "pytest>=8.0",
36
+ "pytest-cov>=5.0",
37
+ "ruff>=0.8",
38
+ "pre-commit>=4.0",
39
+ "ipykernel>=6.0", # Jupyter notebook support
40
+ ]
41
+
42
+ [project.urls]
43
+ Homepage = "https://github.com/ov3rfit/flua"
44
+ Repository = "https://github.com/ov3rfit/flua"
45
+ Issues = "https://github.com/ov3rfit/flua/issues"
46
+
47
+
48
+ # ---- Hatch Build Settings ----
49
+
50
+ [tool.hatch.build.targets.sdist]
51
+ exclude = [
52
+ "notebooks/",
53
+ "tests/",
54
+ ".github/",
55
+ ]
56
+
57
+ [tool.hatch.build.targets.wheel]
58
+ packages = ["src/flua"]
59
+
60
+
61
+ # ---- Ruff ----
62
+
63
+ [tool.ruff]
64
+ target-version = "py311"
65
+ line-length = 88
66
+ src = ["src"]
67
+
68
+ [tool.ruff.lint]
69
+ select = [
70
+ "E", # pycodestyle errors
71
+ "W", # pycodestyle warnings
72
+ "F", # pyflakes
73
+ "I", # isort
74
+ "N", # pep8-naming
75
+ "UP", # pyupgrade
76
+ "B", # flake8-bugbear
77
+ "SIM", # flake8-simplify
78
+ ]
79
+
80
+ [tool.ruff.lint.isort]
81
+ known-first-party = ["flua"]
82
+
83
+
84
+ # ---- Pytest ----
85
+
86
+ [tool.pytest.ini_options]
87
+ testpaths = ["tests"]
88
+ pythonpath = ["src"]
89
+
90
+
91
+ # ---- Coverage ----
92
+
93
+ [tool.coverage.run]
94
+ source = ["flua"]
95
+ omit = ["tests/*", "notebooks/*"]
96
+
97
+ [tool.coverage.report]
98
+ show_missing = true
99
+ skip_empty = true
@@ -0,0 +1,33 @@
1
+ """flua – Influenza A FASTA sequence analysis toolkit."""
2
+
3
+ from flua.constants import ALTERNATIVE_PRODUCTS, INFLUENZA_SEGMENTS
4
+ from flua.display import print_group_summary
5
+ from flua.io import groups_to_dataframe, load_fasta, load_multiple_fasta
6
+ from flua.models import AnalyzedSequence, SequenceGroup
7
+ from flua.products import AlternativeProduct, generate_alternative_products
8
+ from flua.seq_utils import (
9
+ detect_sequence_type,
10
+ extract_subtype,
11
+ identify_segment,
12
+ translate_sequence,
13
+ )
14
+
15
+ __all__ = [
16
+ # Constants
17
+ "ALTERNATIVE_PRODUCTS",
18
+ "INFLUENZA_SEGMENTS",
19
+ # Models
20
+ "AlternativeProduct",
21
+ "AnalyzedSequence",
22
+ "SequenceGroup",
23
+ # Functions
24
+ "detect_sequence_type",
25
+ "extract_subtype",
26
+ "generate_alternative_products",
27
+ "groups_to_dataframe",
28
+ "identify_segment",
29
+ "load_fasta",
30
+ "load_multiple_fasta",
31
+ "print_group_summary",
32
+ "translate_sequence",
33
+ ]
@@ -0,0 +1,110 @@
1
+ """Constants and configuration for influenza A sequence analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ # Standard influenza A segment names (8 segments).
6
+ INFLUENZA_SEGMENTS = ["PB2", "PB1", "PA", "HA", "NP", "NA", "MP", "NS"]
7
+
8
+ # IUPAC amino acid characters that never appear in DNA/RNA sequences.
9
+ PROTEIN_ONLY_CHARS = set("FLIMSPHQEDKWRV")
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # Influenza A alternative product definitions
13
+ # ---------------------------------------------------------------------------
14
+ # Mechanism types:
15
+ # "direct" – translate the full-length sequence in frame 1
16
+ # "splicing" – join exon1 + exon2, then translate
17
+ # "alt_orf" – scan a specified reading frame for the first ATG
18
+ # "frameshift" – ribosomal frameshift: N-terminal frame 0, then +1 frame
19
+ #
20
+ # Coordinates are 0-based and correspond to a canonical influenza A genome.
21
+ # Exact positions may vary slightly between strains.
22
+ # ---------------------------------------------------------------------------
23
+
24
+ ALTERNATIVE_PRODUCTS: dict[str, list[dict]] = {
25
+ "PB2": [
26
+ {
27
+ "name": "PB2",
28
+ "mechanism": "direct",
29
+ "description": "RNA-dependent RNA polymerase subunit PB2",
30
+ },
31
+ ],
32
+ "PB1": [
33
+ {
34
+ "name": "PB1",
35
+ "mechanism": "direct",
36
+ "description": "RNA-dependent RNA polymerase subunit PB1",
37
+ },
38
+ {
39
+ "name": "PB1-F2",
40
+ "mechanism": "alt_orf",
41
+ "description": "Pro-apoptotic mitochondrial protein from +1 ORF of PB1",
42
+ "scan_frame": 1,
43
+ "min_length_aa": 50,
44
+ },
45
+ ],
46
+ "PA": [
47
+ {
48
+ "name": "PA",
49
+ "mechanism": "direct",
50
+ "description": "RNA-dependent RNA polymerase subunit PA",
51
+ },
52
+ {
53
+ "name": "PA-X",
54
+ "mechanism": "frameshift",
55
+ "description": "Host shutoff protein via +1 ribosomal frameshift of PA",
56
+ "frameshift_nt": 573, # 191 codons * 3
57
+ "shift": 1,
58
+ "x_orf_length_aa": 61,
59
+ },
60
+ ],
61
+ "HA": [
62
+ {
63
+ "name": "HA",
64
+ "mechanism": "direct",
65
+ "description": "Hemagglutinin",
66
+ },
67
+ ],
68
+ "NP": [
69
+ {
70
+ "name": "NP",
71
+ "mechanism": "direct",
72
+ "description": "Nucleoprotein",
73
+ },
74
+ ],
75
+ "NA": [
76
+ {
77
+ "name": "NA",
78
+ "mechanism": "direct",
79
+ "description": "Neuraminidase",
80
+ },
81
+ ],
82
+ "MP": [
83
+ {
84
+ "name": "M1",
85
+ "mechanism": "direct",
86
+ "description": "Matrix protein 1 (unspliced colinear transcript)",
87
+ },
88
+ {
89
+ "name": "M2",
90
+ "mechanism": "splicing",
91
+ "description": "Ion channel protein (spliced from MP segment)",
92
+ "exon1_end": 51,
93
+ "exon2_start": 740,
94
+ },
95
+ ],
96
+ "NS": [
97
+ {
98
+ "name": "NS1",
99
+ "mechanism": "direct",
100
+ "description": "Non-structural protein 1 (unspliced colinear transcript)",
101
+ },
102
+ {
103
+ "name": "NEP",
104
+ "mechanism": "splicing",
105
+ "description": "Nuclear export protein / NS2 (spliced from NS segment)",
106
+ "exon1_end": 56,
107
+ "exon2_start": 529,
108
+ },
109
+ ],
110
+ }
@@ -0,0 +1,26 @@
1
+ """Human-readable display helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from flua.models import SequenceGroup
6
+
7
+
8
def print_group_summary(group: SequenceGroup) -> None:
    """Write a human-readable overview of *group* to stdout.

    Prints the group header, detected subtype, sequence count, one line
    per sequence, and an indented line per alternative product.
    """
    print(f"=== {group.group_name} (from: {group.source_file}) ===")
    print(f" Subtype: {group.subtype or '(not detected)'}")
    print(f" Total sequences: {len(group.sequences)}")
    for entry in group.sequences:
        label = entry.segment_name or "(unknown)"
        aa_len = len(entry.translated) if entry.translated else "-"
        summary_line = (
            f" [{label}] {entry.id} | "
            f"type={entry.seq_type} | "
            f"length={entry.length} | "
            f"translated_length={aa_len}"
        )
        print(summary_line)
        for product in entry.alt_products:
            print(
                f" └─ {product.name} ({product.mechanism}): "
                f"{product.length_aa} aa | {product.description}"
            )
    print()
@@ -0,0 +1,187 @@
1
+ """FASTA file I/O and DataFrame conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import warnings
6
+ from collections import Counter
7
+ from pathlib import Path
8
+ from typing import Literal
9
+
10
+ import pandas as pd
11
+ from Bio import SeqIO
12
+
13
+ from flua.constants import INFLUENZA_SEGMENTS
14
+ from flua.models import AnalyzedSequence, SequenceGroup
15
+ from flua.products import generate_alternative_products
16
+ from flua.seq_utils import (
17
+ detect_sequence_type,
18
+ extract_subtype,
19
+ identify_segment,
20
+ translate_sequence,
21
+ )
22
+
23
+ # ── Loading ──────────────────────────────────────────────────────────────
24
+
25
+
26
def load_fasta(
    filepath: str | Path,
    segment_names: list[str] | None = None,
    group_name: str | None = None,
) -> SequenceGroup:
    """Read a single FASTA file and return a :class:`SequenceGroup`.

    Each record is typed (DNA/RNA/Protein), translated when applicable,
    assigned a segment, and scanned for a subtype token; the group's
    subtype is the most frequently seen one.
    """
    path = Path(filepath)
    result = SequenceGroup(
        group_name=path.stem if group_name is None else group_name,
        source_file=str(path),
    )
    subtype_votes: Counter[str] = Counter()

    for record in SeqIO.parse(str(path), "fasta"):
        raw = str(record.seq)
        kind = detect_sequence_type(raw)
        segment = identify_segment(record.id, record.description, segment_names)

        found = extract_subtype(record.id, record.description)
        if found:
            subtype_votes[found] += 1

        # Alternative products only make sense for nucleotide input with
        # a recognized segment.
        products: list = []
        if kind != "Protein" and segment is not None:
            products = generate_alternative_products(raw, segment)

        result.sequences.append(
            AnalyzedSequence(
                record=record,
                seq_type=kind,
                translated=translate_sequence(raw, kind),
                segment_name=segment,
                alt_products=products,
            )
        )

    # Assign the most frequently detected subtype to the group.
    if subtype_votes:
        result.subtype = subtype_votes.most_common(1)[0][0]

    return result
67
+
68
+
69
def load_multiple_fasta(
    filepaths: list[str | Path],
    segment_names: list[str] | None = None,
) -> list[SequenceGroup]:
    """Read several FASTA files, returning one :class:`SequenceGroup`
    per file, in input order."""
    groups = []
    for path in filepaths:
        groups.append(load_fasta(path, segment_names=segment_names))
    return groups
76
+
77
+
78
+ # ── DataFrame conversion ─────────────────────────────────────────────────
79
+
80
+
81
def groups_to_dataframe(
    groups: list[SequenceGroup],
    value_type: Literal["raw", "translated"] = "raw",
    segment_names: list[str] | None = None,
    include_alt_products: bool = True,
) -> pd.DataFrame:
    """Convert a list of :class:`SequenceGroup` objects into a
    :class:`~pandas.DataFrame`.

    Parameters
    ----------
    groups:
        Sequence groups to convert.
    value_type:
        ``"raw"`` for nucleotide sequences, ``"translated"`` for amino
        acid sequences.
    segment_names:
        Segment names to include as columns. Defaults to
        :data:`~flua.constants.INFLUENZA_SEGMENTS`.
    include_alt_products:
        If ``True``, add columns for each alternative product.
    """
    if segment_names is None:
        segment_names = INFLUENZA_SEGMENTS

    # Collect alternative product names per segment across all groups so
    # every row ends up with the same set of product columns.
    all_product_names: dict[str, set[str]] = {}
    if include_alt_products:
        for group in groups:
            for seq in group.sequences:
                if seq.segment_name:
                    names = all_product_names.setdefault(seq.segment_name, set())
                    names.update(p.name for p in seq.alt_products)

    rows = []
    for group in groups:
        row: dict = {
            "group_name": group.group_name,
            "source_file": group.source_file,
            "subtype": group.subtype,
            "num_sequences": len(group.sequences),
        }

        for seg_name in segment_names:
            seq_obj = group.get_segment(seg_name)

            if seq_obj is None:
                row[f"{seg_name}_seq"] = None
                row[f"{seg_name}_length"] = None
                row[f"{seg_name}_type"] = None
            else:
                if value_type == "translated" and seq_obj.translated is not None:
                    # Amino-acid view of the segment.
                    row[f"{seg_name}_seq"] = seq_obj.translated
                    row[f"{seg_name}_length"] = len(seq_obj.translated)
                else:
                    # Raw view; also the fallback when no translation
                    # exists (e.g. the input was already a protein).
                    row[f"{seg_name}_seq"] = seq_obj.raw_sequence
                    row[f"{seg_name}_length"] = seq_obj.length
                row[f"{seg_name}_type"] = seq_obj.seq_type

            if include_alt_products and seg_name in all_product_names:
                for prod_name in sorted(all_product_names[seg_name]):
                    product = (
                        seq_obj.get_product(prod_name)
                        if seq_obj is not None
                        else None
                    )
                    # One pair of columns per product; a missing segment
                    # and a missing product both yield None.
                    row[f"{prod_name}_protein"] = (
                        product.protein_seq if product is not None else None
                    )
                    row[f"{prod_name}_length_aa"] = (
                        product.length_aa if product is not None else None
                    )

        rows.append(row)

    df = pd.DataFrame(rows)
    _check_length_consistency(df, segment_names)
    return df
163
+
164
+
165
+ def _check_length_consistency(df: pd.DataFrame, segment_names: list[str]) -> None:
166
+ """Emit a warning when sequence lengths for the same segment differ
167
+ across samples."""
168
+ if len(df) < 2:
169
+ return
170
+ for seg_name in segment_names:
171
+ length_col = f"{seg_name}_length"
172
+ if length_col not in df.columns:
173
+ continue
174
+ lengths = df[length_col].dropna()
175
+ if len(lengths) < 2:
176
+ continue
177
+ if len(lengths.unique()) > 1:
178
+ info = ", ".join(
179
+ f"{r['group_name']}={r[length_col]}"
180
+ for _, r in df.iterrows()
181
+ if pd.notna(r[length_col])
182
+ )
183
+ warnings.warn(
184
+ f"[{seg_name}] Sequence lengths differ across samples: {info}",
185
+ UserWarning,
186
+ stacklevel=3,
187
+ )
@@ -0,0 +1,74 @@
1
+ """Core data models for analyzed sequences and sequence groups."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+ from Bio.SeqRecord import SeqRecord
8
+
9
+ from flua.products import AlternativeProduct
10
+
11
+
12
@dataclass
class AnalyzedSequence:
    """A single analyzed sequence (typically one influenza segment)."""

    record: SeqRecord               # underlying Biopython record
    seq_type: str                   # "DNA" | "RNA" | "Protein"
    translated: str | None          # frame-1 translation; None for proteins
    segment_name: str | None        # canonical segment name, if identified
    alt_products: list[AlternativeProduct] = field(default_factory=list)
    length: int = 0                 # derived from the record (see below)

    def __post_init__(self) -> None:
        # Length is always recomputed from the record; any caller-supplied
        # value is overwritten.
        self.length = len(self.record.seq)

    @property
    def id(self) -> str:
        """Record identifier (FASTA header token)."""
        return self.record.id

    @property
    def description(self) -> str:
        """Full FASTA description line."""
        return self.record.description

    @property
    def raw_sequence(self) -> str:
        """The sequence letters as a plain string."""
        return str(self.record.seq)

    def get_product(self, name: str) -> AlternativeProduct | None:
        """Look up an alternative product by *name* (case-insensitive)."""
        wanted = name.upper()
        return next(
            (p for p in self.alt_products if p.name.upper() == wanted),
            None,
        )
44
+
45
+
46
@dataclass
class SequenceGroup:
    """A collection of sequences originating from a single FASTA file."""

    group_name: str                 # usually the file stem
    source_file: str                # path the group was loaded from
    subtype: str | None = None      # e.g. "H1N1", if detected
    sequences: list[AnalyzedSequence] = field(default_factory=list)

    @property
    def segment_names(self) -> list[str | None]:
        """Segment name of every sequence, in file order."""
        return [entry.segment_name for entry in self.sequences]

    def get_segment(self, name: str) -> AnalyzedSequence | None:
        """Return the sequence whose segment name matches *name*
        (case-insensitive), or ``None`` when absent."""
        wanted = name.upper()
        return next(
            (
                entry
                for entry in self.sequences
                if entry.segment_name and entry.segment_name.upper() == wanted
            ),
            None,
        )

    def get_all_products(self) -> list[tuple[str | None, AlternativeProduct]]:
        """Return ``(segment_name, product)`` pairs for every alternative
        product across all sequences in the group."""
        return [
            (entry.segment_name, product)
            for entry in self.sequences
            for product in entry.alt_products
        ]
@@ -0,0 +1,166 @@
1
+ """Generation of influenza A alternative protein products.
2
+
3
+ Supports four mechanisms: direct translation, mRNA splicing, alternative
4
+ ORF scanning, and ribosomal frameshifting.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+
11
+ from flua.constants import ALTERNATIVE_PRODUCTS
12
+ from flua.seq_utils import translate_frame1
13
+
14
+
15
@dataclass
class AlternativeProduct:
    """A single alternative protein product derived from an influenza
    segment."""

    name: str               # product name, e.g. "M2", "PA-X"
    mechanism: str          # "direct" | "splicing" | "alt_orf" | "frameshift"
    description: str
    nucleotide_seq: str     # coding nucleotides for this product
    protein_seq: str        # translated product
    length_nt: int = 0      # derived below
    length_aa: int = 0      # derived below

    def __post_init__(self) -> None:
        # Lengths are always recomputed from the sequences themselves,
        # ignoring any caller-supplied values.
        self.length_aa = len(self.protein_seq)
        self.length_nt = len(self.nucleotide_seq)
31
+
32
+
33
+ # ── Per-mechanism generators ─────────────────────────────────────────────
34
+
35
+
36
def _generate_direct(seq: str, pdef: dict) -> AlternativeProduct | None:
    """Primary protein: translate the full-length sequence in frame 1."""
    return AlternativeProduct(
        name=pdef["name"],
        mechanism="direct",
        description=pdef["description"],
        nucleotide_seq=seq,
        protein_seq=translate_frame1(seq),
    )
46
+
47
+
48
def _generate_spliced(seq: str, pdef: dict) -> AlternativeProduct | None:
    """Spliced product: exon 1 joined to exon 2, then translated.

    Returns ``None`` when the sequence is too short to contain exon 2.
    """
    exon2_start = pdef["exon2_start"]
    if len(seq) < exon2_start:
        return None

    joined = seq[: pdef["exon1_end"]] + seq[exon2_start:]
    return AlternativeProduct(
        name=pdef["name"],
        mechanism="splicing",
        description=pdef["description"],
        nucleotide_seq=joined,
        protein_seq=translate_frame1(joined),
    )
66
+
67
+
68
def _generate_alt_orf(seq: str, pdef: dict) -> AlternativeProduct | None:
    """Alternative ORF: walk codons in *scan_frame* and return the first
    ATG-initiated ORF whose translated product reaches ``min_length_aa``.

    Translation stops at the first stop codon; shorter ORFs are skipped
    and the scan continues. Returns ``None`` when nothing qualifies.
    """
    frame = pdef.get("scan_frame", 1)
    min_aa = pdef.get("min_length_aa", 50)

    for start in range(frame, len(seq) - 2, 3):
        # Normalize so RNA input ("AUG") is recognized too.
        codon = seq[start : start + 3].upper().replace("U", "T")
        if codon != "ATG":
            continue
        protein = translate_frame1(seq[start:])
        stop = protein.find("*")
        if stop != -1:
            protein = protein[:stop]
        if len(protein) < min_aa:
            continue
        # Nucleotide span includes the stop codon.
        orf_nt = seq[start : start + (len(protein) + 1) * 3]
        return AlternativeProduct(
            name=pdef["name"],
            mechanism="alt_orf",
            description=pdef["description"],
            nucleotide_seq=orf_nt,
            protein_seq=protein,
        )
    return None
91
+
92
+
93
def _generate_frameshift(seq: str, pdef: dict) -> AlternativeProduct | None:
    """Ribosomal frameshift: frame-0 N-terminus fused with a shifted
    C-terminal ORF (e.g. PA-X from the PA segment).

    Returns ``None`` when the sequence is too short to extend past the
    frameshift site.
    """
    fs_nt = pdef["frameshift_nt"]
    shift = pdef.get("shift", 1)
    x_len = pdef.get("x_orf_length_aa", 61)

    # Require some sequence beyond the frameshift site.
    if len(seq) < fs_nt + 10:
        return None

    c_start = fs_nt + shift
    c_term = translate_frame1(seq[c_start:])
    stop = c_term.find("*")
    if stop != -1:
        c_term = c_term[:stop]
    c_term = c_term[:x_len]

    n_term = translate_frame1(seq[:fs_nt])
    return AlternativeProduct(
        name=pdef["name"],
        mechanism="frameshift",
        description=pdef["description"],
        nucleotide_seq=seq[: c_start + len(c_term) * 3],
        protein_seq=n_term + c_term,
    )
120
+ )
121
+
122
+
123
# Dispatch table: mechanism string (as used in product definition dicts)
# -> generator function. Unknown mechanisms are skipped by
# generate_alternative_products().
_GENERATORS = {
    "direct": _generate_direct,
    "splicing": _generate_spliced,
    "alt_orf": _generate_alt_orf,
    "frameshift": _generate_frameshift,
}
129
+
130
+
131
+ # ── Public API ───────────────────────────────────────────────────────────
132
+
133
+
134
def generate_alternative_products(
    sequence: str,
    segment_name: str,
    product_defs: dict[str, list[dict]] | None = None,
) -> list[AlternativeProduct]:
    """Generate all alternative products for a given segment sequence.

    Parameters
    ----------
    sequence:
        The nucleotide sequence of the segment.
    segment_name:
        One of the standard influenza A segment names (e.g. ``"PA"``).
    product_defs:
        Custom product definition table. Defaults to
        :data:`~flua.constants.ALTERNATIVE_PRODUCTS`.
    """
    table = ALTERNATIVE_PRODUCTS if product_defs is None else product_defs

    # Segments with no definitions fall back to a plain direct translation.
    defs = table.get(segment_name) or [
        {"name": segment_name, "mechanism": "direct", "description": ""}
    ]

    results: list[AlternativeProduct] = []
    for pdef in defs:
        make = _GENERATORS.get(pdef["mechanism"])
        if make is None:
            # Unknown mechanism: skip rather than fail.
            continue
        made = make(sequence, pdef)
        if made is not None:
            results.append(made)
    return results
@@ -0,0 +1,140 @@
1
+ """Low-level sequence utilities: type detection, translation, subtype
2
+ extraction, and segment identification."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import re
7
+ from typing import Literal
8
+
9
+ from Bio.Seq import Seq
10
+
11
+ from flua.constants import INFLUENZA_SEGMENTS, PROTEIN_ONLY_CHARS
12
+
13
+ # ── Sequence type detection ──────────────────────────────────────────────
14
+
15
+
16
def detect_sequence_type(sequence: str) -> Literal["DNA", "RNA", "Protein"]:
    """Classify *sequence* as DNA, RNA, or Protein.

    Gap characters (``-`` and ``.``) are ignored. Any amino-acid-only
    letter marks the sequence as protein; otherwise the presence of U
    without T marks it as RNA, and anything else is treated as DNA.
    """
    residues = set(sequence.upper()) - {"-", "."}
    if residues & PROTEIN_ONLY_CHARS:
        return "Protein"
    if "U" in residues and "T" not in residues:
        return "RNA"
    return "DNA"
27
+
28
+
29
+ # ── Translation ──────────────────────────────────────────────────────────
30
+
31
+
32
def translate_frame1(sequence: str) -> str:
    """Translate *sequence* in reading frame 1 (offset 0).

    Any trailing partial codon is dropped before translation; input
    shorter than one full codon yields the empty string.
    """
    codon_span = len(sequence) - len(sequence) % 3
    if codon_span == 0:
        return ""
    return str(Seq(sequence[:codon_span]).translate())
42
+
43
+
44
def translate_sequence(sequence: str, seq_type: str) -> str | None:
    """Return the frame-1 translation of *sequence*.

    Protein input cannot be translated further, so ``None`` is returned
    when *seq_type* equals ``"Protein"``.
    """
    return None if seq_type == "Protein" else translate_frame1(sequence)
50
+
51
+
52
+ # ── Subtype extraction ───────────────────────────────────────────────────
53
+
54
# H<digits>N<digits> with optional parentheses and pdm suffix.
# Group 1 captures the core "H#N#" token; group 2 optionally captures a
# pandemic-strain suffix such as "pdm09". Each bracketed character class
# consumes at most one surrounding delimiter character — (, [, ), ],
# whitespace, |, _ or / — so forms like "(H1N1)", "H3N2|segment 3" and
# "H1N1pdm09" all match. Compiled case-insensitively.
_SUBTYPE_PATTERN = re.compile(
    r"[\(\[\s|_/]?"
    r"(H\d{1,2}N\d{1,2})"
    r"[\)\]\s|_/]?"
    r"(pdm\d{0,4})?"
    r"[\)\]\s|_/]?",
    re.IGNORECASE,
)

# Separate H and N tokens (e.g. "H5 subtype N6"). The lookbehind and
# lookahead require each token to stand alone rather than be embedded in
# a longer alphanumeric run — e.g. the "N2" inside "H3N2" is rejected by
# _N_PATTERN because it is preceded by the digit "3".
_H_PATTERN = re.compile(r"(?<![A-Z0-9])(H\d{1,2})(?![A-Z0-9])", re.IGNORECASE)
_N_PATTERN = re.compile(r"(?<![A-Z0-9])(N\d{1,2})(?![A-Z0-9])", re.IGNORECASE)
67
+
68
+
69
def extract_subtype(header_id: str, description: str) -> str | None:
    """Pull an influenza subtype string out of a FASTA header.

    Handles combined notation such as ``A/California/07/2009(H1N1)``,
    ``H5N1``, ``H1N1pdm09`` and ``H3N2|segment 3``, as well as split
    H/N notation like ``"H5 subtype N6"``.

    Returns
    -------
    str | None
        The extracted subtype (e.g. ``"H1N1"``, ``"H5N1pdm09"``), or
        ``None`` when no subtype can be found.
    """
    haystack = f"{header_id} {description}"

    # Preferred form: a single combined "H#N#" token (optional pdm suffix).
    hit = _SUBTYPE_PATTERN.search(haystack)
    if hit is not None:
        subtype, pdm = hit.group(1), hit.group(2)
        if pdm:
            return subtype.upper() + pdm.lower()
        return subtype.upper()

    # Fallback: independent H and N tokens scattered through the header.
    h_hit = _H_PATTERN.search(haystack)
    n_hit = _N_PATTERN.search(haystack)
    if h_hit is not None and n_hit is not None:
        return h_hit.group(1).upper() + n_hit.group(1).upper()

    return None
98
+
99
+
100
+ # ── Segment identification ───────────────────────────────────────────────
101
+
102
+
103
+ def _build_segment_patterns(
104
+ segment_names: list[str],
105
+ ) -> list[tuple[str, re.Pattern]]:
106
+ """Build compiled regex patterns for each segment name.
107
+
108
+ Longer names are matched first so that e.g. ``"PB1-F2"`` takes
109
+ priority over ``"PB1"``. Word boundaries are defined by characters
110
+ that are *not* alphanumeric or hyphens, preventing ``"PA"`` from
111
+ matching inside ``"PA-X"``.
112
+ """
113
+ sorted_names = sorted(segment_names, key=len, reverse=True)
114
+ patterns = []
115
+ for name in sorted_names:
116
+ escaped = re.escape(name.upper())
117
+ pattern = re.compile(
118
+ r"(?<![A-Z0-9\-])" + escaped + r"(?![A-Z0-9\-])",
119
+ re.IGNORECASE,
120
+ )
121
+ patterns.append((name, pattern))
122
+ return patterns
123
+
124
+
125
def identify_segment(
    header_id: str,
    description: str,
    segment_names: list[str] | None = None,
) -> str | None:
    """Identify the influenza segment name from a FASTA header.

    Searches ``header_id`` plus ``description`` for a segment name
    (longest names tried first) and returns the first hit, or ``None``
    when no segment name occurs.
    """
    names = INFLUENZA_SEGMENTS if segment_names is None else segment_names
    haystack = f"{header_id} {description}"
    return next(
        (
            name
            for name, pattern in _build_segment_patterns(names)
            if pattern.search(haystack)
        ),
        None,
    )