peek-bio 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. peek_bio-0.1.0/LICENSE +21 -0
  2. peek_bio-0.1.0/PKG-INFO +167 -0
  3. peek_bio-0.1.0/README.md +125 -0
  4. peek_bio-0.1.0/pyproject.toml +71 -0
  5. peek_bio-0.1.0/setup.cfg +4 -0
  6. peek_bio-0.1.0/src/peek_bio/__init__.py +3 -0
  7. peek_bio-0.1.0/src/peek_bio/__main__.py +4 -0
  8. peek_bio-0.1.0/src/peek_bio/cli.py +186 -0
  9. peek_bio-0.1.0/src/peek_bio/detect.py +264 -0
  10. peek_bio-0.1.0/src/peek_bio/display.py +194 -0
  11. peek_bio-0.1.0/src/peek_bio/formats/__init__.py +54 -0
  12. peek_bio-0.1.0/src/peek_bio/formats/bed.py +296 -0
  13. peek_bio-0.1.0/src/peek_bio/formats/bigwig.py +165 -0
  14. peek_bio-0.1.0/src/peek_bio/formats/csv_tsv.py +241 -0
  15. peek_bio-0.1.0/src/peek_bio/formats/excel.py +238 -0
  16. peek_bio-0.1.0/src/peek_bio/formats/fasta.py +169 -0
  17. peek_bio-0.1.0/src/peek_bio/formats/fastq.py +237 -0
  18. peek_bio-0.1.0/src/peek_bio/formats/gtf.py +223 -0
  19. peek_bio-0.1.0/src/peek_bio/formats/h5ad.py +109 -0
  20. peek_bio-0.1.0/src/peek_bio/formats/sam_bam.py +422 -0
  21. peek_bio-0.1.0/src/peek_bio/formats/vcf.py +299 -0
  22. peek_bio-0.1.0/src/peek_bio/utils.py +273 -0
  23. peek_bio-0.1.0/src/peek_bio.egg-info/PKG-INFO +167 -0
  24. peek_bio-0.1.0/src/peek_bio.egg-info/SOURCES.txt +35 -0
  25. peek_bio-0.1.0/src/peek_bio.egg-info/dependency_links.txt +1 -0
  26. peek_bio-0.1.0/src/peek_bio.egg-info/entry_points.txt +2 -0
  27. peek_bio-0.1.0/src/peek_bio.egg-info/requires.txt +21 -0
  28. peek_bio-0.1.0/src/peek_bio.egg-info/top_level.txt +1 -0
  29. peek_bio-0.1.0/tests/test_bed.py +62 -0
  30. peek_bio-0.1.0/tests/test_csv_tsv.py +151 -0
  31. peek_bio-0.1.0/tests/test_excel.py +38 -0
  32. peek_bio-0.1.0/tests/test_fasta.py +165 -0
  33. peek_bio-0.1.0/tests/test_fastq.py +216 -0
  34. peek_bio-0.1.0/tests/test_gtf.py +141 -0
  35. peek_bio-0.1.0/tests/test_sam_bam.py +287 -0
  36. peek_bio-0.1.0/tests/test_utils.py +222 -0
  37. peek_bio-0.1.0/tests/test_vcf.py +251 -0
peek_bio-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Patrick Wilson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,167 @@
1
+ Metadata-Version: 2.1
2
+ Name: peek-bio
3
+ Version: 0.1.0
4
+ Summary: Instant file previews for genomics data
5
+ Author: Patrick Wilson
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/pwilson97/peek-bio
8
+ Project-URL: Repository, https://github.com/pwilson97/peek-bio
9
+ Project-URL: Issues, https://github.com/pwilson97/peek-bio/issues
10
+ Keywords: bioinformatics,genomics,bam,vcf,fastq,fasta,bed,gtf,csv,preview,cli
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Operating System :: OS Independent
23
+ Classifier: Environment :: Console
24
+ Requires-Python: >=3.8
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Provides-Extra: excel
28
+ Requires-Dist: openpyxl>=3.0; extra == "excel"
29
+ Provides-Extra: h5ad
30
+ Requires-Dist: anndata>=0.8; extra == "h5ad"
31
+ Provides-Extra: bigwig
32
+ Requires-Dist: pyBigWig>=0.3; extra == "bigwig"
33
+ Provides-Extra: bam
34
+ Requires-Dist: pysam>=0.20; extra == "bam"
35
+ Provides-Extra: all
36
+ Requires-Dist: openpyxl>=3.0; extra == "all"
37
+ Requires-Dist: anndata>=0.8; extra == "all"
38
+ Requires-Dist: pyBigWig>=0.3; extra == "all"
39
+ Requires-Dist: pysam>=0.20; extra == "all"
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest>=6.0; extra == "dev"
42
+
43
+ # peek-bio
44
+
45
+ Instant file previews for genomics data. One command, any format.
46
+
47
+ ```
48
+ pip install peek-bio
49
+ ```
50
+
51
+ ## What it does
52
+
53
+ Point `peek` at a file and get a structured summary: row counts, column types,
54
+ quality scores, variant stats, mapping rates, QC warnings. No scripts, no
55
+ notebooks, no googling command flags.
56
+
57
+ ```
58
+ $ peek deseq2_results.csv
59
+
60
+ deseq2_results.csv — >10,553 x 7 (CSV, comma-separated)
61
+ ────────────────────────────────────────────────────────────────────
62
+ Columns:
63
+ str 0610005C13Rik, 0610009B22Rik, ... (1,000 unique)
64
+ baseMean float 3.92 … 1983.92 (median: 25.32, mean: 59.32) ⡇⡀⡀⡀⡀⡀⡀⡀⡀⡀
65
+ log2FoldChange float -3.29 … 3.60 (median: -0.02, mean: -0.04) ⡀⡀⡀⡀⡀⡇⡄⡀⡀⡀⡀⡀
66
+ lfcSE float 0.11 … 1.23 (median: 0.35, mean: 0.40) ⡄⡇⡆⡄⡄⡄⡀⡀⡀⡀⡀⡀
67
+ stat float -5.94 … 8.10 (median: -0.06, mean: -0.11) ⡀⡀⡀⡀⡇⡆⡄⡀⡀⡀⡀
68
+ pvalue float 5.46e-16 … 1.00 (median: 0.37, mean: 0.45) ⡇⡄⡄⡄⡄⡄⡄⡀⡄⡄⡄⡄
69
+ padj float 3.42e-13 … 1.00 (median: 0.95, mean: 0.81) ⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡇
70
+
71
+ Missing: pvalue (1)
72
+ ```
73
+
74
+ ```
75
+ $ peek NA12878.bam
76
+
77
+ NA12878.bam — 61,614 reads (BAM, indexed)
78
+ ────────────────────────────────────────────────────────────────────
79
+ Reference: 3366 sequences, 3.2 Gb [GRCh38 (with alts)]
80
+ Reads: 60,749 mapped (98.6%), 865 unmapped
81
+ Flags: 0.1% duplicates, 1.5% supplementary
82
+ Paired: yes (2×250 bp)
83
+ Insert size: mean 449 median 428 range 100–999 ⡀⡀⡀⡀⡆⡇⡄⡄⡄⡀⡀⡀
84
+ Read groups: 3 (NA12878, NA12878, NA12878)
85
+ Sort order: coordinate
86
+ Programs: bwamem, MarkDuplicates, GATK ApplyBQSR
87
+ MAPQ: mean 55.3 median 60 ⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡇
88
+ ```
89
+
90
+ ```
91
+ $ peek ERR188273_chrX_1.fq.gz
92
+
93
+ ERR188273_chrX_1.fq.gz — 30,531 reads, 2.3 Mb (FASTQ, Phred+33)
94
+ ────────────────────────────────────────────────────────────────────
95
+ Read length: all 75 bp
96
+ Quality: mean Q36.7 median Q38 range Q2–Q41 ⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡆⡇
97
+ GC content: 48.9%
98
+ ```
99
+
100
+ ```
101
+ $ peek clinvar.vcf.gz
102
+
103
+ clinvar.vcf.gz — 4,403,650 variants (VCF)
104
+ ────────────────────────────────────────────────────────────────────
105
+ Variants: 4,103,565 snps, 93,659 insertions, 194,377 deletions, 12,049 complexes
106
+ Ts/Tv: 1.69
107
+ FILTER: 4,403,650 PASS
108
+ Chroms: 32 total — top: 1 (398,195), 2 (384,641), 17 (265,676)
109
+ ```
110
+
111
+ ## Supported formats
112
+
113
+ **Core** (no extra dependencies):
114
+
115
+ | Format | Extensions |
116
+ |--------|-----------|
117
+ | CSV/TSV | `.csv`, `.tsv`, `.txt` |
118
+ | BED | `.bed`, `.narrowPeak`, `.broadPeak`, `.bedGraph` |
119
+ | FASTA | `.fa`, `.fasta` |
120
+ | FASTQ | `.fq`, `.fastq` |
121
+ | VCF | `.vcf`, `.vcf.gz` |
122
+ | GTF/GFF | `.gtf`, `.gff`, `.gff3` |
123
+
124
+ **Optional** (install what you need):
125
+
126
+ | Format | Extensions | Install |
127
+ |--------|-----------|---------|
128
+ | SAM/BAM/CRAM | `.sam`, `.bam`, `.cram` | `pip install peek-bio[bam]` |
129
+ | Excel | `.xlsx`, `.xls` | `pip install peek-bio[excel]` |
130
+ | BigWig | `.bw`, `.bigwig` | `pip install peek-bio[bigwig]` |
131
+ | H5AD | `.h5ad` | `pip install peek-bio[h5ad]` |
132
+
133
+ Or install everything: `pip install peek-bio[all]`
134
+
135
+ ## QC warnings
136
+
137
+ peek flags common issues automatically:
138
+
139
+ - Unusual GC content (outside 25-65%)
140
+ - High N content in assemblies (>20%)
141
+ - Low mean base quality in FASTQ (<Q20)
142
+ - Adapter contamination in FASTQ (>5%)
143
+ - Low mapping rate in BAM/SAM (<80%)
144
+ - Low mean MAPQ (<20)
145
+ - High duplicate rate (>30%)
146
+ - Ts/Tv ratio out of range in VCF
147
+ - Low genotype rate in multi-sample VCF (<90%)
148
+ - No gene features or missing gene_id in GTF
149
+ - Single-chromosome GTF (possible subset)
150
+ - Columns with >50% missing data
151
+ - Mixed-type columns (numbers and strings in the same column)
152
+
153
+ ## Usage
154
+
155
+ ```
156
+ peek FILE [FILE ...] # preview one or more files
157
+ peek --head 20 FILE # show 20 preview rows instead of 5
158
+ peek --no-color FILE # plain text output (no ANSI colors)
159
+ peek --formats # list all supported formats + install status
160
+ peek --version # print version
161
+ ```
162
+
163
+ Compressed files (`.gz`) are handled transparently.
164
+
165
+ ## License
166
+
167
+ MIT
@@ -0,0 +1,125 @@
1
+ # peek-bio
2
+
3
+ Instant file previews for genomics data. One command, any format.
4
+
5
+ ```
6
+ pip install peek-bio
7
+ ```
8
+
9
+ ## What it does
10
+
11
+ Point `peek` at a file and get a structured summary: row counts, column types,
12
+ quality scores, variant stats, mapping rates, QC warnings. No scripts, no
13
+ notebooks, no googling command flags.
14
+
15
+ ```
16
+ $ peek deseq2_results.csv
17
+
18
+ deseq2_results.csv — >10,553 x 7 (CSV, comma-separated)
19
+ ────────────────────────────────────────────────────────────────────
20
+ Columns:
21
+ str 0610005C13Rik, 0610009B22Rik, ... (1,000 unique)
22
+ baseMean float 3.92 … 1983.92 (median: 25.32, mean: 59.32) ⡇⡀⡀⡀⡀⡀⡀⡀⡀⡀
23
+ log2FoldChange float -3.29 … 3.60 (median: -0.02, mean: -0.04) ⡀⡀⡀⡀⡀⡇⡄⡀⡀⡀⡀⡀
24
+ lfcSE float 0.11 … 1.23 (median: 0.35, mean: 0.40) ⡄⡇⡆⡄⡄⡄⡀⡀⡀⡀⡀⡀
25
+ stat float -5.94 … 8.10 (median: -0.06, mean: -0.11) ⡀⡀⡀⡀⡇⡆⡄⡀⡀⡀⡀
26
+ pvalue float 5.46e-16 … 1.00 (median: 0.37, mean: 0.45) ⡇⡄⡄⡄⡄⡄⡄⡀⡄⡄⡄⡄
27
+ padj float 3.42e-13 … 1.00 (median: 0.95, mean: 0.81) ⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡇
28
+
29
+ Missing: pvalue (1)
30
+ ```
31
+
32
+ ```
33
+ $ peek NA12878.bam
34
+
35
+ NA12878.bam — 61,614 reads (BAM, indexed)
36
+ ────────────────────────────────────────────────────────────────────
37
+ Reference: 3366 sequences, 3.2 Gb [GRCh38 (with alts)]
38
+ Reads: 60,749 mapped (98.6%), 865 unmapped
39
+ Flags: 0.1% duplicates, 1.5% supplementary
40
+ Paired: yes (2×250 bp)
41
+ Insert size: mean 449 median 428 range 100–999 ⡀⡀⡀⡀⡆⡇⡄⡄⡄⡀⡀⡀
42
+ Read groups: 3 (NA12878, NA12878, NA12878)
43
+ Sort order: coordinate
44
+ Programs: bwamem, MarkDuplicates, GATK ApplyBQSR
45
+ MAPQ: mean 55.3 median 60 ⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡇
46
+ ```
47
+
48
+ ```
49
+ $ peek ERR188273_chrX_1.fq.gz
50
+
51
+ ERR188273_chrX_1.fq.gz — 30,531 reads, 2.3 Mb (FASTQ, Phred+33)
52
+ ────────────────────────────────────────────────────────────────────
53
+ Read length: all 75 bp
54
+ Quality: mean Q36.7 median Q38 range Q2–Q41 ⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡀⡆⡇
55
+ GC content: 48.9%
56
+ ```
57
+
58
+ ```
59
+ $ peek clinvar.vcf.gz
60
+
61
+ clinvar.vcf.gz — 4,403,650 variants (VCF)
62
+ ────────────────────────────────────────────────────────────────────
63
+ Variants: 4,103,565 snps, 93,659 insertions, 194,377 deletions, 12,049 complexes
64
+ Ts/Tv: 1.69
65
+ FILTER: 4,403,650 PASS
66
+ Chroms: 32 total — top: 1 (398,195), 2 (384,641), 17 (265,676)
67
+ ```
68
+
69
+ ## Supported formats
70
+
71
+ **Core** (no extra dependencies):
72
+
73
+ | Format | Extensions |
74
+ |--------|-----------|
75
+ | CSV/TSV | `.csv`, `.tsv`, `.txt` |
76
+ | BED | `.bed`, `.narrowPeak`, `.broadPeak`, `.bedGraph` |
77
+ | FASTA | `.fa`, `.fasta` |
78
+ | FASTQ | `.fq`, `.fastq` |
79
+ | VCF | `.vcf`, `.vcf.gz` |
80
+ | GTF/GFF | `.gtf`, `.gff`, `.gff3` |
81
+
82
+ **Optional** (install what you need):
83
+
84
+ | Format | Extensions | Install |
85
+ |--------|-----------|---------|
86
+ | SAM/BAM/CRAM | `.sam`, `.bam`, `.cram` | `pip install peek-bio[bam]` |
87
+ | Excel | `.xlsx`, `.xls` | `pip install peek-bio[excel]` |
88
+ | BigWig | `.bw`, `.bigwig` | `pip install peek-bio[bigwig]` |
89
+ | H5AD | `.h5ad` | `pip install peek-bio[h5ad]` |
90
+
91
+ Or install everything: `pip install peek-bio[all]`
92
+
93
+ ## QC warnings
94
+
95
+ peek flags common issues automatically:
96
+
97
+ - Unusual GC content (outside 25-65%)
98
+ - High N content in assemblies (>20%)
99
+ - Low mean base quality in FASTQ (<Q20)
100
+ - Adapter contamination in FASTQ (>5%)
101
+ - Low mapping rate in BAM/SAM (<80%)
102
+ - Low mean MAPQ (<20)
103
+ - High duplicate rate (>30%)
104
+ - Ts/Tv ratio out of range in VCF
105
+ - Low genotype rate in multi-sample VCF (<90%)
106
+ - No gene features or missing gene_id in GTF
107
+ - Single-chromosome GTF (possible subset)
108
+ - Columns with >50% missing data
109
+ - Mixed-type columns (numbers and strings in the same column)
110
+
111
+ ## Usage
112
+
113
+ ```
114
+ peek FILE [FILE ...] # preview one or more files
115
+ peek --head 20 FILE # show 20 preview rows instead of 5
116
+ peek --no-color FILE # plain text output (no ANSI colors)
117
+ peek --formats # list all supported formats + install status
118
+ peek --version # print version
119
+ ```
120
+
121
+ Compressed files (`.gz`) are handled transparently.
122
+
123
+ ## License
124
+
125
+ MIT
@@ -0,0 +1,71 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "peek-bio"
7
+ version = "0.1.0"
8
+ description = "Instant file previews for genomics data"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.8"
12
+ authors = [
13
+ { name = "Patrick Wilson" },
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Science/Research",
18
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.8",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Programming Language :: Python :: 3.13",
27
+ "Operating System :: OS Independent",
28
+ "Environment :: Console",
29
+ ]
30
+ keywords = [
31
+ "bioinformatics",
32
+ "genomics",
33
+ "bam",
34
+ "vcf",
35
+ "fastq",
36
+ "fasta",
37
+ "bed",
38
+ "gtf",
39
+ "csv",
40
+ "preview",
41
+ "cli",
42
+ ]
43
+
44
+ [project.urls]
45
+ Homepage = "https://github.com/pwilson97/peek-bio"
46
+ Repository = "https://github.com/pwilson97/peek-bio"
47
+ Issues = "https://github.com/pwilson97/peek-bio/issues"
48
+
49
+ [project.scripts]
50
+ peek = "peek_bio.cli:main"
51
+
52
+ [project.optional-dependencies]
53
+ excel = ["openpyxl>=3.0"]
54
+ h5ad = ["anndata>=0.8"]
55
+ bigwig = ["pyBigWig>=0.3"]
56
+ bam = ["pysam>=0.20"]
57
+ all = [
58
+ "openpyxl>=3.0",
59
+ "anndata>=0.8",
60
+ "pyBigWig>=0.3",
61
+ "pysam>=0.20",
62
+ ]
63
+ dev = [
64
+ "pytest>=6.0",
65
+ ]
66
+
67
+ [tool.setuptools.packages.find]
68
+ where = ["src"]
69
+
70
+ [tool.pytest.ini_options]
71
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ """peek-bio: Universal file preview for genomics data."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,4 @@
1
+ """Allow running as `python -m peek_bio`."""
2
+ from peek_bio.cli import main
3
+
4
+ main()
@@ -0,0 +1,186 @@
1
+ """CLI entry point for peek."""
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+
7
+ from peek_bio import __version__
8
+ from peek_bio.detect import FormatType, detect_format, format_label
9
+ from peek_bio.display import Style, error, set_color
10
+ from peek_bio.formats import get_handler
11
+
12
+
13
def _supported_formats():
    """Print every supported format together with its install status.

    Core formats are always listed as ``built-in``.  Each optional format is
    probed by importing its backing package: a successful import is shown as
    ``installed``, while an ``ImportError`` is shown as the ``pip install``
    command for the matching extra.
    """
    # Local import: only this function needs it.
    import importlib

    print(f" {Style.BOLD}peek-bio{Style.RESET} v{__version__} — supported formats:\n")

    # Core (always available)
    core = [
        ("CSV/TSV", ".csv, .tsv, .txt", "built-in"),
        ("BED", ".bed, .narrowPeak, .broadPeak, .bedGraph", "built-in"),
        ("FASTA", ".fa, .fasta", "built-in"),
        ("GTF/GFF", ".gtf, .gff, .gff3", "built-in"),
        ("VCF", ".vcf, .vcf.gz", "built-in"),
        ("FASTQ", ".fq, .fastq", "built-in"),
    ]

    # Optional: (display name, extensions, module to probe, extra name)
    optional_specs = [
        ("Excel", ".xlsx, .xls", "openpyxl", "excel"),
        ("H5AD", ".h5ad", "anndata", "h5ad"),
        ("BigWig", ".bw, .bigwig", "pyBigWig", "bigwig"),
        ("SAM/BAM", ".sam, .bam, .cram", "pysam", "bam"),
    ]

    optional = []
    for name, ext, module, extra in optional_specs:
        try:
            importlib.import_module(module)
        except ImportError:
            status = f"{Style.YELLOW}pip install peek-bio[{extra}]{Style.RESET}"
        else:
            status = f"{Style.GREEN}installed{Style.RESET}"
        optional.append((name, ext, status))

    # Column widths are shared by both groups so the two tables line up.
    max_name = max(len(f[0]) for f in core + optional)
    max_ext = max(len(f[1]) for f in core + optional)

    for index, group in enumerate((core, optional)):
        if index:
            print()  # blank line between the core and optional groups
        for name, ext, status in group:
            print(f" {Style.BOLD}{name:<{max_name}}{Style.RESET} {Style.DIM}{ext:<{max_ext}}{Style.RESET} {status}")
68
+
69
+
70
def _peek_one(path, opts):
    """Preview a single file.

    Args:
        path: Path of the file to preview.
        opts: Options object passed unchanged to the format handler.

    Returns:
        True if the file was previewed, or was an index/companion file that
        is skipped on purpose.  False if the path is missing, is a directory,
        has an unrecognized format, has no available handler, or the handler
        raised while reading it; an error is reported in each of these cases.
    """
    if not os.path.exists(path):
        error(f"file not found: {path}")
        return False

    if os.path.isdir(path):
        error(f"is a directory: {path}")
        return False

    # Index / auxiliary files are companions to data files, not data
    # themselves, so they are skipped silently and count as a success.
    index_suffixes = (".bai", ".csi", ".tbi", ".fai", ".idx", ".gzi")
    if os.path.basename(path).lower().endswith(index_suffixes):
        return True

    fmt, is_compressed = detect_format(path)

    if fmt == FormatType.UNKNOWN:
        error(f"unrecognized format: {path}")
        return False

    handler = get_handler(fmt)
    if handler is None:
        # Format recognized but handler not available (missing optional dep)
        label = format_label(fmt)
        error(f"{label} format detected but handler not available")
        dep_hints = {
            FormatType.EXCEL: "pip install peek-bio[excel]",
            FormatType.H5AD: "pip install peek-bio[h5ad]",
            FormatType.BIGWIG: "pip install peek-bio[bigwig]",
            FormatType.SAM: "pip install peek-bio[bam]",
            FormatType.BAM: "pip install peek-bio[bam]",
            FormatType.CRAM: "pip install peek-bio[bam]",
        }
        hint = dep_hints.get(fmt)
        if hint:
            print(f" Run: {hint}", file=sys.stderr)
        return False

    # Top-level boundary: any handler failure is reported for this file
    # and turned into a False result rather than propagated.
    try:
        handler(path, is_compressed, opts)
    except Exception as exc:
        error(f"failed to read {path}: {exc}")
        return False

    return True
122
+
123
+
124
def main(argv=None):
    """Run the ``peek`` command line.

    ``argv`` is handed straight to ``argparse``; with the default ``None``
    argparse reads the process arguments.  The function returns ``None`` on
    success and calls ``sys.exit(1)`` when no files were given (after
    printing the help text) or when any file could not be previewed.
    """
    cli = argparse.ArgumentParser(
        prog="peek",
        description="Universal file preview for genomics data",
    )
    cli.add_argument("files", nargs="*", help="file(s) to preview")
    cli.add_argument(
        "--head", "-n",
        type=int,
        default=5,
        help="number of preview rows to show (default: 5)",
    )
    cli.add_argument("--no-color", action="store_true", help="disable colored output")
    cli.add_argument("--formats", action="store_true", help="list supported file formats")
    cli.add_argument(
        "--version", "-v",
        action="version",
        version=f"peek-bio {__version__}",
    )

    parsed = cli.parse_args(argv)

    # Color must be switched off before anything is printed.
    if parsed.no_color:
        set_color(False)

    # --formats takes precedence over any files that were also passed.
    if parsed.formats:
        _supported_formats()
        return

    if not parsed.files:
        cli.print_help()
        sys.exit(1)

    options = {"head": parsed.head}
    all_ok = True

    for position, target in enumerate(parsed.files):
        if position > 0:
            print()  # Blank line between files
        # Preview first, then combine, so every file is attempted even
        # after an earlier one has failed.
        all_ok = _peek_one(target, options) and all_ok

    if not all_ok:
        sys.exit(1)
183
+
184
+
185
+ if __name__ == "__main__":
186
+ main()