filterframes 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- filterframes-0.2.0/.github/dependabot.yml +11 -0
- filterframes-0.2.0/.github/workflows/python-package.yml +44 -0
- filterframes-0.2.0/.github/workflows/python-publish.yml +28 -0
- filterframes-0.2.0/.gitignore +20 -0
- filterframes-0.2.0/CHANGELOG.md +26 -0
- {filterframes-0.1.3/src/filterframes.egg-info → filterframes-0.2.0}/PKG-INFO +18 -4
- filterframes-0.2.0/justfile +39 -0
- filterframes-0.2.0/pyproject.toml +68 -0
- filterframes-0.2.0/src/filterframes/__init__.py +15 -0
- {filterframes-0.1.3 → filterframes-0.2.0}/src/filterframes/filterframes.py +99 -76
- filterframes-0.2.0/src/filterframes/py.typed +0 -0
- {filterframes-0.1.3 → filterframes-0.2.0/src/filterframes.egg-info}/PKG-INFO +18 -4
- {filterframes-0.1.3 → filterframes-0.2.0}/src/filterframes.egg-info/SOURCES.txt +11 -2
- filterframes-0.2.0/src/filterframes.egg-info/requires.txt +7 -0
- filterframes-0.2.0/tests/data/DTASelect-filter_V2_1_12_paser.txt +176 -0
- filterframes-0.2.0/tests/data/DTASelect-filter_V2_1_13.txt +161 -0
- filterframes-0.2.0/tests/test_filterframes.py +128 -0
- filterframes-0.2.0/uv.lock +927 -0
- filterframes-0.1.3/pyproject.toml +0 -36
- filterframes-0.1.3/setup.py +0 -5
- filterframes-0.1.3/src/filterframes/__init__.py +0 -13
- filterframes-0.1.3/src/filterframes.egg-info/requires.txt +0 -1
- filterframes-0.1.3/tests/test_filterframes.py +0 -47
- {filterframes-0.1.3 → filterframes-0.2.0}/LICENSE +0 -0
- {filterframes-0.1.3 → filterframes-0.2.0}/README.md +0 -0
- {filterframes-0.1.3 → filterframes-0.2.0}/setup.cfg +0 -0
- {filterframes-0.1.3 → filterframes-0.2.0}/src/filterframes.egg-info/dependency_links.txt +0 -0
- {filterframes-0.1.3 → filterframes-0.2.0}/src/filterframes.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# To get started with Dependabot version updates, you'll need to specify which
|
|
2
|
+
# package ecosystems to update and where the package manifests are located.
|
|
3
|
+
# Please see the documentation for all configuration options:
|
|
4
|
+
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
|
|
5
|
+
|
|
6
|
+
version: 2
|
|
7
|
+
updates:
|
|
8
|
+
- package-ecosystem: "pip"
|
|
9
|
+
directory: "/"
|
|
10
|
+
schedule:
|
|
11
|
+
interval: "daily"
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [ "main" ]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [ "main" ]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- name: Install uv
|
|
15
|
+
uses: astral-sh/setup-uv@v6
|
|
16
|
+
- uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: "3.13"
|
|
19
|
+
- name: Install dependencies
|
|
20
|
+
run: uv sync --extra dev
|
|
21
|
+
- name: Ruff lint
|
|
22
|
+
run: uvx ruff check src/ tests/
|
|
23
|
+
- name: Ruff format
|
|
24
|
+
run: uvx ruff format --check src/ tests/
|
|
25
|
+
- name: Type check with ty
|
|
26
|
+
run: uvx ty check src/
|
|
27
|
+
|
|
28
|
+
test:
|
|
29
|
+
runs-on: ubuntu-latest
|
|
30
|
+
strategy:
|
|
31
|
+
fail-fast: false
|
|
32
|
+
matrix:
|
|
33
|
+
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
|
34
|
+
steps:
|
|
35
|
+
- uses: actions/checkout@v4
|
|
36
|
+
- name: Install uv
|
|
37
|
+
uses: astral-sh/setup-uv@v6
|
|
38
|
+
- uses: actions/setup-python@v5
|
|
39
|
+
with:
|
|
40
|
+
python-version: ${{ matrix.python-version }}
|
|
41
|
+
- name: Install dependencies
|
|
42
|
+
run: uv sync --extra dev
|
|
43
|
+
- name: Test with pytest
|
|
44
|
+
run: uv run pytest --cov=filterframes --cov-report=term-missing
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: Upload Python Package
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
deploy:
|
|
12
|
+
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- name: Install uv
|
|
18
|
+
uses: astral-sh/setup-uv@v6
|
|
19
|
+
- uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: '3.13'
|
|
22
|
+
- name: Build package
|
|
23
|
+
run: uv build
|
|
24
|
+
- name: Publish package
|
|
25
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
26
|
+
with:
|
|
27
|
+
user: __token__
|
|
28
|
+
password: ${{ secrets.PYPI_API_TOKEN }}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Distribution / packaging
|
|
7
|
+
*.egg-info/
|
|
8
|
+
*.egg
|
|
9
|
+
dist/
|
|
10
|
+
build/
|
|
11
|
+
|
|
12
|
+
# Testing / coverage
|
|
13
|
+
.coverage
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
htmlcov/
|
|
16
|
+
|
|
17
|
+
# IDE
|
|
18
|
+
.idea/
|
|
19
|
+
.vscode/
|
|
20
|
+
*.swp
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
## [0.2.0]
|
|
6
|
+
|
|
7
|
+
### Changed
|
|
8
|
+
- Minimum Python version raised from 3.8 to 3.9
|
|
9
|
+
- Added Python 3.12 and 3.13 to CI test matrix
|
|
10
|
+
- Updated GitHub Actions to v4/v5 for checkout and setup-python
|
|
11
|
+
- Improved exception chaining (`raise ... from exc`) for better tracebacks
|
|
12
|
+
- Renamed type alias from `FILE_TYPES` to `FileTypes` (PEP 8 compliance)
|
|
13
|
+
- Added `py.typed` marker for PEP 561 typed package support
|
|
14
|
+
- Added logging throughout the parsing pipeline
|
|
15
|
+
- Added input validation for malformed DTASelect-filter files
|
|
16
|
+
- Added `pytest-cov` for test coverage reporting in CI
|
|
17
|
+
- Expanded test suite with input type, error handling, and data integrity tests
|
|
18
|
+
- Modernized `pyproject.toml` with optional `[dev]` dependencies and tool configs
|
|
19
|
+
- Removed legacy `setup.py` (not needed with modern pip)
|
|
20
|
+
- Fixed potential crash when `end_lines` is empty
|
|
21
|
+
- Pinned minimum pandas version to `>=1.5`
|
|
22
|
+
|
|
23
|
+
## [0.1.3]
|
|
24
|
+
|
|
25
|
+
### Changed
|
|
26
|
+
- `_get_lines` now works with streamlit uploaded file, and any io-type
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: filterframes
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: A very simple DTASelect-Filter.txt parser.
|
|
5
5
|
Author-email: Patrick Garrett <pgarrett@scripps.edu>
|
|
6
6
|
License: MIT License
|
|
@@ -25,15 +25,29 @@ License: MIT License
|
|
|
25
25
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
26
|
SOFTWARE.
|
|
27
27
|
|
|
28
|
-
Project-URL:
|
|
28
|
+
Project-URL: Repository, https://github.com/pgarrett-scripps/FilterFrames
|
|
29
|
+
Project-URL: Changelog, https://github.com/pgarrett-scripps/FilterFrames/blob/main/CHANGELOG.md
|
|
29
30
|
Keywords: IP2,PASER,Parser,Streamlit,DTASelect-filter,Peptide,Protein,Proteomics
|
|
30
31
|
Classifier: Programming Language :: Python :: 3
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
33
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
31
37
|
Classifier: Development Status :: 4 - Beta
|
|
32
38
|
Classifier: License :: OSI Approved :: MIT License
|
|
33
39
|
Classifier: Operating System :: OS Independent
|
|
34
|
-
|
|
40
|
+
Classifier: Typing :: Typed
|
|
41
|
+
Requires-Python: >=3.9
|
|
35
42
|
Description-Content-Type: text/markdown
|
|
36
43
|
License-File: LICENSE
|
|
44
|
+
Requires-Dist: pandas>=1.5
|
|
45
|
+
Provides-Extra: dev
|
|
46
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
47
|
+
Requires-Dist: pytest-cov>=4; extra == "dev"
|
|
48
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
49
|
+
Requires-Dist: ty; extra == "dev"
|
|
50
|
+
Dynamic: license-file
|
|
37
51
|
|
|
38
52
|

|
|
39
53
|

|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# List available recipes
|
|
2
|
+
default:
|
|
3
|
+
@just --list
|
|
4
|
+
|
|
5
|
+
# Install the package with dev dependencies
|
|
6
|
+
install:
|
|
7
|
+
uv sync --extra dev
|
|
8
|
+
|
|
9
|
+
# Run all checks (lint, format, typecheck, test)
|
|
10
|
+
check: lint format typecheck test
|
|
11
|
+
|
|
12
|
+
# Run ruff linter
|
|
13
|
+
lint:
|
|
14
|
+
uvx ruff check src/ tests/
|
|
15
|
+
|
|
16
|
+
# Check code formatting
|
|
17
|
+
format:
|
|
18
|
+
uvx ruff format --check src/ tests/
|
|
19
|
+
|
|
20
|
+
# Auto-fix lint issues and format code
|
|
21
|
+
fix:
|
|
22
|
+
uvx ruff check --fix src/ tests/
|
|
23
|
+
uvx ruff format src/ tests/
|
|
24
|
+
|
|
25
|
+
# Run ty type checker
|
|
26
|
+
typecheck:
|
|
27
|
+
uvx ty check src/
|
|
28
|
+
|
|
29
|
+
# Run tests
|
|
30
|
+
test *args:
|
|
31
|
+
uv run pytest {{ args }}
|
|
32
|
+
|
|
33
|
+
# Run tests with coverage
|
|
34
|
+
test-cov:
|
|
35
|
+
uv run pytest --cov=filterframes --cov-report=term-missing
|
|
36
|
+
|
|
37
|
+
# Build the package
|
|
38
|
+
build:
|
|
39
|
+
uv build
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "setuptools-scm"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "filterframes"
|
|
7
|
+
authors = [
|
|
8
|
+
{name = "Patrick Garrett", email = "pgarrett@scripps.edu"},
|
|
9
|
+
]
|
|
10
|
+
description = "A very simple DTASelect-Filter.txt parser."
|
|
11
|
+
readme = "README.md"
|
|
12
|
+
requires-python = ">=3.9"
|
|
13
|
+
dynamic = ["version"]
|
|
14
|
+
license = {file = "LICENSE"}
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.9",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Development Status :: 4 - Beta",
|
|
23
|
+
"License :: OSI Approved :: MIT License",
|
|
24
|
+
"Operating System :: OS Independent",
|
|
25
|
+
"Typing :: Typed",
|
|
26
|
+
]
|
|
27
|
+
dependencies = [
|
|
28
|
+
"pandas>=1.5",
|
|
29
|
+
]
|
|
30
|
+
keywords = ["IP2", "PASER", "Parser", "Streamlit", "DTASelect-filter", "Peptide", "Protein", "Proteomics"]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
dev = [
|
|
34
|
+
"pytest>=7",
|
|
35
|
+
"pytest-cov>=4",
|
|
36
|
+
"ruff>=0.4",
|
|
37
|
+
"ty",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Repository = "https://github.com/pgarrett-scripps/FilterFrames"
|
|
42
|
+
Changelog = "https://github.com/pgarrett-scripps/FilterFrames/blob/main/CHANGELOG.md"
|
|
43
|
+
|
|
44
|
+
[tool.setuptools]
|
|
45
|
+
package-dir = {"" = "src"}
|
|
46
|
+
|
|
47
|
+
[tool.setuptools.dynamic]
|
|
48
|
+
version = {attr = "filterframes.__version__"}
|
|
49
|
+
|
|
50
|
+
[tool.pytest.ini_options]
|
|
51
|
+
testpaths = ["tests"]
|
|
52
|
+
|
|
53
|
+
[tool.ruff]
|
|
54
|
+
target-version = "py39"
|
|
55
|
+
line-length = 120
|
|
56
|
+
src = ["src"]
|
|
57
|
+
|
|
58
|
+
[tool.ruff.lint]
|
|
59
|
+
select = [
|
|
60
|
+
"E", # pycodestyle errors
|
|
61
|
+
"W", # pycodestyle warnings
|
|
62
|
+
"F", # pyflakes
|
|
63
|
+
"I", # isort
|
|
64
|
+
"UP", # pyupgrade
|
|
65
|
+
"B", # flake8-bugbear
|
|
66
|
+
"SIM", # flake8-simplify
|
|
67
|
+
"RUF", # ruff-specific rules
|
|
68
|
+
]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""FilterFrames: A DTASelect-filter.txt parser built on pandas."""
|
|
2
|
+
|
|
3
|
+
from .filterframes import (
|
|
4
|
+
FileTypes,
|
|
5
|
+
from_dta_select_filter,
|
|
6
|
+
to_dta_select_filter,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"FileTypes",
|
|
11
|
+
"from_dta_select_filter",
|
|
12
|
+
"to_dta_select_filter",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
__version__ = "0.2.0"
|
|
@@ -1,15 +1,23 @@
|
|
|
1
|
-
"""Module providing
|
|
1
|
+
"""Module providing functions for converting between DTASelect-filter.txt files and pandas DataFrame objects."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
2
6
|
import os
|
|
7
|
+
from collections.abc import Generator
|
|
3
8
|
from enum import Enum
|
|
4
|
-
from io import
|
|
5
|
-
from typing import
|
|
9
|
+
from io import StringIO, TextIOWrapper
|
|
10
|
+
from typing import Any, TextIO, Union
|
|
6
11
|
|
|
7
12
|
import pandas as pd
|
|
8
13
|
|
|
9
|
-
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# Union form required at runtime for Python 3.9 compatibility
|
|
17
|
+
FileTypes = Union[str, TextIOWrapper, StringIO, TextIO]
|
|
10
18
|
|
|
11
19
|
|
|
12
|
-
def _get_lines(file_input: FILE_TYPES) -> Generator[str, None, None]:
|
|
20
|
+
def _get_lines(file_input: FileTypes) -> Generator[str, None, None]:
|
|
13
21
|
"""
|
|
14
22
|
Retrieve lines from a file or string input.
|
|
15
23
|
|
|
@@ -25,31 +33,30 @@ def _get_lines(file_input: FILE_TYPES) -> Generator[str, None, None]:
|
|
|
25
33
|
Raises:
|
|
26
34
|
ValueError: If the input type is not supported.
|
|
27
35
|
"""
|
|
28
|
-
if isinstance(file_input, str):
|
|
36
|
+
if isinstance(file_input, str): # File path or string
|
|
29
37
|
if os.path.exists(file_input):
|
|
30
|
-
|
|
38
|
+
logger.debug("Reading from file path: %s", file_input)
|
|
39
|
+
with open(file=file_input, encoding="UTF-8") as file:
|
|
31
40
|
for line in file:
|
|
32
|
-
yield line.rstrip(
|
|
41
|
+
yield line.rstrip("\n")
|
|
33
42
|
else:
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
43
|
+
logger.debug("Reading from raw string input")
|
|
44
|
+
for line in file_input.split("\n"):
|
|
45
|
+
yield line.rstrip("\n")
|
|
46
|
+
elif isinstance(file_input, (TextIOWrapper, StringIO)):
|
|
47
|
+
logger.debug("Reading from %s object", type(file_input).__name__)
|
|
37
48
|
file_input.seek(0)
|
|
38
49
|
for line in file_input:
|
|
39
|
-
yield line.rstrip(
|
|
40
|
-
elif isinstance(file_input, StringIO): # StringIO
|
|
41
|
-
file_input.seek(0)
|
|
42
|
-
for line in file_input.readlines():
|
|
43
|
-
yield line.rstrip('\n')
|
|
50
|
+
yield line.rstrip("\n")
|
|
44
51
|
else:
|
|
45
52
|
try:
|
|
46
53
|
for line in file_input:
|
|
47
|
-
yield line.decode(
|
|
48
|
-
except
|
|
49
|
-
raise ValueError(f
|
|
54
|
+
yield line.decode("UTF-8").rstrip("\n") # ty: ignore[unresolved-attribute]
|
|
55
|
+
except (AttributeError, TypeError, UnicodeDecodeError) as exc:
|
|
56
|
+
raise ValueError(f"Unsupported input type: {type(file_input)}") from exc
|
|
50
57
|
|
|
51
58
|
|
|
52
|
-
def _convert_to_best_datatype(values: List[Any]):
|
|
59
|
+
def _convert_to_best_datatype(values: list[Any]) -> list[Any]:
|
|
53
60
|
"""
|
|
54
61
|
Convert a list of values to the most suitable datatype.
|
|
55
62
|
|
|
@@ -64,11 +71,9 @@ def _convert_to_best_datatype(values: List[Any]):
|
|
|
64
71
|
Raises:
|
|
65
72
|
ValueError: If unable to convert values to any datatype.
|
|
66
73
|
"""
|
|
67
|
-
|
|
68
74
|
for datatype in [float, int, str]:
|
|
69
75
|
try:
|
|
70
|
-
|
|
71
|
-
return converted_values
|
|
76
|
+
return [datatype(value) for value in values]
|
|
72
77
|
except (ValueError, TypeError):
|
|
73
78
|
continue
|
|
74
79
|
raise ValueError("Unable to convert values to any datatype")
|
|
@@ -104,12 +109,12 @@ def _reorder_columns(dataframe: pd.DataFrame, column: str, new_position: int) ->
|
|
|
104
109
|
pd.DataFrame: A dataframe with reordered columns.
|
|
105
110
|
"""
|
|
106
111
|
|
|
107
|
-
columns = dataframe.columns.tolist()
|
|
112
|
+
columns: list[str] = dataframe.columns.tolist()
|
|
108
113
|
columns.insert(new_position, columns.pop(columns.index(column)))
|
|
109
|
-
return dataframe
|
|
114
|
+
return dataframe.reindex(columns=columns)
|
|
110
115
|
|
|
111
116
|
|
|
112
|
-
def _write_lines(file_output, lines):
|
|
117
|
+
def _write_lines(file_output: TextIOWrapper | StringIO, lines: list[str]) -> None:
|
|
113
118
|
"""
|
|
114
119
|
Write a list of lines to a given file output.
|
|
115
120
|
|
|
@@ -119,11 +124,12 @@ def _write_lines(file_output, lines):
|
|
|
119
124
|
"""
|
|
120
125
|
|
|
121
126
|
for line in lines:
|
|
122
|
-
file_output.write(line +
|
|
127
|
+
file_output.write(line + "\n")
|
|
123
128
|
|
|
124
129
|
|
|
125
|
-
def from_dta_select_filter(
|
|
126
|
-
|
|
130
|
+
def from_dta_select_filter(
|
|
131
|
+
file_input: str | TextIOWrapper | StringIO | TextIO,
|
|
132
|
+
) -> tuple[list[str], pd.DataFrame, pd.DataFrame, list[str]]:
|
|
127
133
|
"""
|
|
128
134
|
Process the given file and extract relevant information to create peptide and protein dataframes.
|
|
129
135
|
|
|
@@ -143,47 +149,49 @@ def from_dta_select_filter(file_input: Union[str, TextIOWrapper, StringIO, TextI
|
|
|
143
149
|
|
|
144
150
|
lines = _get_lines(file_input)
|
|
145
151
|
|
|
146
|
-
class
|
|
147
|
-
"""
|
|
148
|
-
Enum for specifying the different parts of the DTASelect-filter.txt file
|
|
149
|
-
"""
|
|
152
|
+
class _FileState(Enum):
|
|
150
153
|
HEADER = 1
|
|
151
154
|
DATA = 2
|
|
152
155
|
INFO = 3
|
|
153
156
|
|
|
154
|
-
file_state =
|
|
157
|
+
file_state = _FileState.HEADER
|
|
155
158
|
|
|
156
|
-
header_lines
|
|
157
|
-
|
|
158
|
-
|
|
159
|
+
header_lines: list[str] = []
|
|
160
|
+
end_lines: list[str] = []
|
|
161
|
+
peptide_data: dict[str, list[Any]] | None = None
|
|
162
|
+
protein_data: dict[str, list[Any]] | None = None
|
|
163
|
+
current_protein_grp = 0
|
|
164
|
+
peptide_line_cnt = 0
|
|
159
165
|
|
|
160
|
-
for
|
|
166
|
+
for line in lines:
|
|
161
167
|
line_elements = line.rstrip().split("\t")
|
|
162
168
|
|
|
163
|
-
if line.startswith(
|
|
169
|
+
if line.startswith("Locus"): # Protein Line Header
|
|
164
170
|
protein_data = {key: [] for key in line_elements}
|
|
165
|
-
protein_data[
|
|
171
|
+
protein_data["ProteinGroup"] = []
|
|
166
172
|
|
|
167
|
-
if line.startswith(
|
|
173
|
+
if line.startswith("Unique"): # Peptide Line Header
|
|
168
174
|
peptide_data = {key: [] for key in line_elements}
|
|
169
|
-
peptide_data[
|
|
175
|
+
peptide_data["ProteinGroup"] = []
|
|
170
176
|
|
|
171
177
|
header_lines.append(line)
|
|
172
|
-
file_state =
|
|
178
|
+
file_state = _FileState.DATA
|
|
173
179
|
continue
|
|
174
180
|
|
|
175
181
|
if len(line_elements) > 1 and line_elements[1] == "Proteins":
|
|
176
|
-
file_state =
|
|
182
|
+
file_state = _FileState.INFO
|
|
177
183
|
|
|
178
|
-
if file_state ==
|
|
184
|
+
if file_state == _FileState.HEADER:
|
|
179
185
|
header_lines.append(line)
|
|
180
186
|
|
|
181
|
-
if file_state ==
|
|
182
|
-
if
|
|
187
|
+
if file_state == _FileState.DATA:
|
|
188
|
+
if peptide_data is None or protein_data is None:
|
|
189
|
+
continue
|
|
183
190
|
|
|
191
|
+
if line_elements[0] == "" or "*" in line_elements[0] or line_elements[0].isnumeric():
|
|
184
192
|
for key, value in zip(peptide_data, line_elements):
|
|
185
193
|
peptide_data[key].append(value)
|
|
186
|
-
peptide_data[
|
|
194
|
+
peptide_data["ProteinGroup"].append(current_protein_grp)
|
|
187
195
|
|
|
188
196
|
peptide_line_cnt += 1
|
|
189
197
|
else:
|
|
@@ -193,11 +201,20 @@ def from_dta_select_filter(file_input: Union[str, TextIOWrapper, StringIO, TextI
|
|
|
193
201
|
|
|
194
202
|
for key, value in zip(protein_data, line_elements):
|
|
195
203
|
protein_data[key].append(value)
|
|
196
|
-
protein_data[
|
|
204
|
+
protein_data["ProteinGroup"].append(current_protein_grp)
|
|
197
205
|
|
|
198
|
-
if file_state ==
|
|
206
|
+
if file_state == _FileState.INFO:
|
|
199
207
|
end_lines.append(line)
|
|
200
208
|
|
|
209
|
+
if peptide_data is None or protein_data is None:
|
|
210
|
+
raise ValueError("Input does not appear to be a valid DTASelect-filter file: missing header columns")
|
|
211
|
+
|
|
212
|
+
logger.debug(
|
|
213
|
+
"Parsed %d peptide columns, %d protein columns",
|
|
214
|
+
len(peptide_data),
|
|
215
|
+
len(protein_data),
|
|
216
|
+
)
|
|
217
|
+
|
|
201
218
|
for k in peptide_data:
|
|
202
219
|
peptide_data[k] = _convert_to_best_datatype(peptide_data[k])
|
|
203
220
|
|
|
@@ -207,27 +224,33 @@ def from_dta_select_filter(file_input: Union[str, TextIOWrapper, StringIO, TextI
|
|
|
207
224
|
peptide_df = pd.DataFrame(peptide_data)
|
|
208
225
|
protein_df = pd.DataFrame(protein_data)
|
|
209
226
|
|
|
210
|
-
file_name_components = [fn.split(
|
|
211
|
-
peptide_df.drop([
|
|
227
|
+
file_name_components = [fn.split(".") for fn in peptide_df["FileName"]]
|
|
228
|
+
peptide_df.drop(["FileName"], axis=1, inplace=True)
|
|
212
229
|
|
|
213
|
-
peptide_df[
|
|
214
|
-
peptide_df[
|
|
230
|
+
peptide_df["FileName"] = _convert_to_best_datatype([comp[0] for comp in file_name_components])
|
|
231
|
+
peptide_df["FileName"] = peptide_df["FileName"].astype("category")
|
|
215
232
|
|
|
216
|
-
peptide_df[
|
|
217
|
-
peptide_df[
|
|
218
|
-
peptide_df[
|
|
233
|
+
peptide_df["LowScan"] = _convert_to_best_datatype([comp[1] for comp in file_name_components])
|
|
234
|
+
peptide_df["HighScan"] = _convert_to_best_datatype([comp[2] for comp in file_name_components])
|
|
235
|
+
peptide_df["Charge"] = _convert_to_best_datatype([comp[3] for comp in file_name_components])
|
|
219
236
|
|
|
220
237
|
peptide_df = peptide_df.convert_dtypes()
|
|
221
238
|
protein_df = protein_df.convert_dtypes()
|
|
222
239
|
|
|
223
|
-
if end_lines[-1] ==
|
|
240
|
+
if end_lines and end_lines[-1] == "":
|
|
224
241
|
end_lines = end_lines[:-1]
|
|
225
242
|
|
|
243
|
+
logger.info("Parsed %d proteins and %d peptides", len(protein_df), len(peptide_df))
|
|
244
|
+
|
|
226
245
|
return header_lines, peptide_df, protein_df, end_lines
|
|
227
246
|
|
|
228
247
|
|
|
229
|
-
def to_dta_select_filter(
|
|
230
|
-
|
|
248
|
+
def to_dta_select_filter(
|
|
249
|
+
header_lines: list[str],
|
|
250
|
+
peptide_df: pd.DataFrame,
|
|
251
|
+
protein_df: pd.DataFrame,
|
|
252
|
+
end_lines: list[str],
|
|
253
|
+
) -> StringIO:
|
|
231
254
|
"""
|
|
232
255
|
Convert the given header lines, peptide and protein dataframes, and end lines into a StringIO object.
|
|
233
256
|
|
|
@@ -254,40 +277,40 @@ def to_dta_select_filter(header_lines: List[str], peptide_df: pd.DataFrame, prot
|
|
|
254
277
|
|
|
255
278
|
# Write protein and peptide data
|
|
256
279
|
concatenated_file_names = peptide_df.apply(_create_file_name, axis=1)
|
|
257
|
-
peptide_df.drop([
|
|
258
|
-
peptide_df[
|
|
280
|
+
peptide_df.drop(["FileName", "LowScan", "HighScan", "Charge"], axis=1, inplace=True)
|
|
281
|
+
peptide_df["FileName"] = concatenated_file_names
|
|
259
282
|
# Re-order columns to make FileName the second column
|
|
260
|
-
peptide_df = _reorder_columns(peptide_df,
|
|
283
|
+
peptide_df = _reorder_columns(peptide_df, "FileName", 1)
|
|
261
284
|
|
|
262
|
-
protein_data_str = protein_df.drop([
|
|
263
|
-
peptide_data_str = peptide_df.drop([
|
|
285
|
+
protein_data_str = protein_df.drop(["ProteinGroup"], axis=1).to_csv(header=False, index=False, sep="\t")
|
|
286
|
+
peptide_data_str = peptide_df.drop(["ProteinGroup"], axis=1).to_csv(header=False, index=False, sep="\t")
|
|
264
287
|
|
|
265
|
-
protein_data_str = protein_data_str.replace(
|
|
266
|
-
peptide_data_str = peptide_data_str.replace(
|
|
288
|
+
protein_data_str = protein_data_str.replace("\r", "")
|
|
289
|
+
peptide_data_str = peptide_data_str.replace("\r", "")
|
|
267
290
|
|
|
268
291
|
current_protein_grp = 0
|
|
269
|
-
protein_lines = protein_data_str.split(
|
|
270
|
-
peptide_lines = peptide_data_str.split(
|
|
292
|
+
protein_lines = protein_data_str.split("\n")
|
|
293
|
+
peptide_lines = peptide_data_str.split("\n")
|
|
271
294
|
|
|
272
|
-
if protein_lines[-1] ==
|
|
295
|
+
if protein_lines[-1] == "":
|
|
273
296
|
protein_lines = protein_lines[:-1]
|
|
274
297
|
|
|
275
|
-
if peptide_lines[-1] ==
|
|
298
|
+
if peptide_lines[-1] == "":
|
|
276
299
|
peptide_lines = peptide_lines[:-1]
|
|
277
300
|
|
|
278
301
|
protein_line_idx = 0
|
|
279
302
|
peptide_line_idx = 0
|
|
280
303
|
|
|
281
304
|
while protein_line_idx < len(protein_lines) and peptide_line_idx < len(peptide_lines):
|
|
282
|
-
if int(protein_df.iloc[protein_line_idx][
|
|
283
|
-
file_output.write(protein_lines[protein_line_idx] +
|
|
305
|
+
if int(protein_df.iloc[protein_line_idx]["ProteinGroup"]) == current_protein_grp:
|
|
306
|
+
file_output.write(protein_lines[protein_line_idx] + "\n")
|
|
284
307
|
protein_line_idx += 1
|
|
285
308
|
else:
|
|
286
|
-
file_output.write(peptide_lines[peptide_line_idx] +
|
|
309
|
+
file_output.write(peptide_lines[peptide_line_idx] + "\n")
|
|
287
310
|
peptide_line_idx += 1
|
|
288
311
|
if peptide_line_idx < len(peptide_lines) and int(
|
|
289
|
-
|
|
290
|
-
|
|
312
|
+
peptide_df.iloc[peptide_line_idx - 1]["ProteinGroup"]
|
|
313
|
+
) != int(peptide_df.iloc[peptide_line_idx]["ProteinGroup"]):
|
|
291
314
|
current_protein_grp += 1
|
|
292
315
|
|
|
293
316
|
# Write remaining protein and peptide lines
|
|
File without changes
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: filterframes
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: A very simple DTASelect-Filter.txt parser.
|
|
5
5
|
Author-email: Patrick Garrett <pgarrett@scripps.edu>
|
|
6
6
|
License: MIT License
|
|
@@ -25,15 +25,29 @@ License: MIT License
|
|
|
25
25
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
26
|
SOFTWARE.
|
|
27
27
|
|
|
28
|
-
Project-URL:
|
|
28
|
+
Project-URL: Repository, https://github.com/pgarrett-scripps/FilterFrames
|
|
29
|
+
Project-URL: Changelog, https://github.com/pgarrett-scripps/FilterFrames/blob/main/CHANGELOG.md
|
|
29
30
|
Keywords: IP2,PASER,Parser,Streamlit,DTASelect-filter,Peptide,Protein,Proteomics
|
|
30
31
|
Classifier: Programming Language :: Python :: 3
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
33
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
31
37
|
Classifier: Development Status :: 4 - Beta
|
|
32
38
|
Classifier: License :: OSI Approved :: MIT License
|
|
33
39
|
Classifier: Operating System :: OS Independent
|
|
34
|
-
|
|
40
|
+
Classifier: Typing :: Typed
|
|
41
|
+
Requires-Python: >=3.9
|
|
35
42
|
Description-Content-Type: text/markdown
|
|
36
43
|
License-File: LICENSE
|
|
44
|
+
Requires-Dist: pandas>=1.5
|
|
45
|
+
Provides-Extra: dev
|
|
46
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
47
|
+
Requires-Dist: pytest-cov>=4; extra == "dev"
|
|
48
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
49
|
+
Requires-Dist: ty; extra == "dev"
|
|
50
|
+
Dynamic: license-file
|
|
37
51
|
|
|
38
52
|

|
|
39
53
|

|