fasta_read 1.1.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -1
- data/Gemfile.lock +1 -1
- data/README.md +30 -20
- data/bin/fasta_read +38 -10
- data/features/fasta_read.feature +11 -13
- data/lib/fasta_read/sequence.rb +1 -3
- data/lib/fasta_read/version.rb +1 -1
- data/spec/fasta_read/sequence_spec.rb +4 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0610fba4a055d83670705afed69fd6a98603749b
|
4
|
+
data.tar.gz: 5203a61fa111d3c29f73fc4396cec5f3d9a43070
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a6bf8413263aeab223c0afd9e3e52a491d7c12c3f007438bb4d8ee4d39c19b626ef6275831f658ef2ab8203b8a7818c9e62626e7e275e2b2ab544f7c819dbf60
|
7
|
+
data.tar.gz: a1d0463b0a048cbfbce9b9196f0cce7ca73f94737e226092efe3db9ae91a74fb35f4d862aa3f747bfc4646d3c0d32e48e79a0e2782dac8dbaa7d073b49c18f7e
|
data/.travis.yml
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -24,7 +24,7 @@ Or install it yourself as:
|
|
24
24
|
## Example
|
25
25
|
|
26
26
|
fasta_read hg19 chr12 112123514 112123790 --output=out.txt
|
27
|
-
|
27
|
+
|
28
28
|
Options:
|
29
29
|
-h, --help Show command line help
|
30
30
|
-o, --output OUTPUTFILE outputs sequence to a file
|
@@ -39,7 +39,7 @@ Or install it yourself as:
|
|
39
39
|
assembly
|
40
40
|
assembly name (hg19, mm10, etc.)
|
41
41
|
chromosome
|
42
|
-
id of chromosome (
|
42
|
+
id of chromosome (chr1-chr22 or chrx/chry)
|
43
43
|
cstart
|
44
44
|
start coordinate (inclusive) within the chromosome
|
45
45
|
cend
|
@@ -51,30 +51,40 @@ stdout: Extracted sequence (only)
|
|
51
51
|
|
52
52
|
stderr: Any errors.
|
53
53
|
|
54
|
-
using --output option
|
54
|
+
Using the --output option exports the sequence to a file.
|
55
55
|
|
56
56
|
## Supporting Requirements
|
57
57
|
|
58
58
|
The program depends on being run at the top of a directory tree containing .fa files. Each .fa file should map to a single chromosome.
|
59
|
-
|
60
59
|
The directory tree will have separate branches for SNPs and unmasked files.
|
61
60
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
61
|
+
For example, given the following tree, the command expects to be run inside the 'fasta' directory:
|
62
|
+
|
63
|
+
fasta
|
64
|
+
├── hg19
|
65
|
+
│ ├── snp
|
66
|
+
│ │ ├── chr1.subst.fa
|
67
|
+
| │ ├── chr2.subst.fa
|
68
|
+
| │ ├── chr3.subst.fa
|
69
|
+
| │ └── ....
|
70
|
+
│ └── unmasked
|
71
|
+
│ ├── chr1.fa
|
72
|
+
| ├── chr2.fa
|
73
|
+
| ├── chr3.fa
|
74
|
+
| └── ....
|
75
|
+
└── mm10
|
76
|
+
├── snp
|
77
|
+
│ ├── chr1.subst.fa
|
78
|
+
│ ├── chr2.subst.fa
|
79
|
+
│ ├── chr3.subst.fa
|
80
|
+
│ └── ....
|
81
|
+
└── unmasked
|
82
|
+
├── chr1.fa
|
83
|
+
├── chr2.fa
|
84
|
+
├── chr3.fa
|
85
|
+
└── ....
|
86
|
+
|
87
|
+
Fasta files can be downloaded at:
|
78
88
|
|
79
89
|
http://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/
|
80
90
|
http://hgdownload.cse.ucsc.edu/goldenPath/hg19/snp138Mask/
|
data/bin/fasta_read
CHANGED
@@ -9,12 +9,10 @@ class App
|
|
9
9
|
include Methadone::CLILogging
|
10
10
|
|
11
11
|
main do |assembly, chromosome, cstart, cend|
|
12
|
-
|
13
|
-
subst = masked_unmasked(options)[1]
|
14
|
-
path = "fasta/#{assembly}/#{snp}/chr#{chromosome + subst}.fa"
|
12
|
+
path = "#{assembly}/#{snp_or_unmasked(options)[0]}/#{chromosome + snp_or_unmasked(options)[1]}.fa"
|
15
13
|
exit_now_with!(assembly_or_chromosome(assembly, chromosome)) unless File.exist?(path)
|
16
14
|
fasta = IO.read(path)
|
17
|
-
debug("About to read #{assembly} #{
|
15
|
+
debug("About to read #{assembly} #{snp_or_unmasked(options)[0]} chr#{chromosome}:#{cstart}-#{cend}")
|
18
16
|
sequence = FastaRead::Sequence.new(fasta, chromosome, cstart, cend).process
|
19
17
|
if options[:output]
|
20
18
|
File.open(options[:output], "w") do |file|
|
@@ -26,12 +24,12 @@ class App
|
|
26
24
|
info "#{sequence.length} base pairs"
|
27
25
|
end
|
28
26
|
|
29
|
-
def self.
|
27
|
+
def self.snp_or_unmasked(options)
|
30
28
|
options[:snp] ? ["snp", ".subst"] : ["unmasked", ""]
|
31
29
|
end
|
32
30
|
|
33
31
|
def self.assembly_or_chromosome(assembly, chromosome)
|
34
|
-
Dir.exist?("
|
32
|
+
Dir.exist?("#{assembly}") ? [chromosome, "chromosome"] : [assembly, "assembly"]
|
35
33
|
end
|
36
34
|
|
37
35
|
def self.exit_now_with!(value_and_argument)
|
@@ -41,17 +39,47 @@ class App
|
|
41
39
|
|
42
40
|
# Declare command-line interface here
|
43
41
|
|
44
|
-
description
|
42
|
+
description <<-EOF
|
43
|
+
Extract DNA Fasta sequence from assembly files.
|
44
|
+
|
45
|
+
The program depends on being run at the top of a directory tree containing .fa files. The .fa files should be of the form where each file maps to a single chomosome.
|
46
|
+
The directory tree will have separate branches for SNPs and unmasked files.
|
47
|
+
|
48
|
+
For example, provided the following tree the command expects to be run inside the 'fasta' directory:
|
49
|
+
|
50
|
+
└── fasta
|
51
|
+
├── hg19
|
52
|
+
│ ├── snp
|
53
|
+
│ │ ├── chr1.subst.fa
|
54
|
+
| │ ├── chr2.subst.fa
|
55
|
+
| │ ├── chr3.subst.fa
|
56
|
+
| │ └── ....
|
57
|
+
│ └── unmasked
|
58
|
+
│ ├── chr1.fa
|
59
|
+
| ├── chr2.fa
|
60
|
+
| ├── chr3.fa
|
61
|
+
| └── ....
|
62
|
+
└── mm10
|
63
|
+
├── snp
|
64
|
+
│ ├── chr1.subst.fa
|
65
|
+
│ ├── chr2.subst.fa
|
66
|
+
│ ├── chr3.subst.fa
|
67
|
+
│ └── ....
|
68
|
+
└── unmasked
|
69
|
+
├── chr1.fa
|
70
|
+
├── chr2.fa
|
71
|
+
├── chr3.fa
|
72
|
+
└── ....
|
73
|
+
EOF
|
74
|
+
|
45
75
|
#
|
46
76
|
# Accept flags via:
|
47
77
|
# on("--flag VAL","Some flag")
|
48
78
|
# options[flag] will contain VAL
|
49
|
-
options['multiple-lines'] = true
|
50
79
|
#
|
51
80
|
# Specify switches via:
|
52
81
|
on "-o OUTPUTFILE", "--output", "outputs sequence to a file"
|
53
82
|
on "--snp", "return the sequence from the SNP-masked assembly"
|
54
|
-
on "--multiple-lines", "return the sequence from the SNP-masked assembly"
|
55
83
|
|
56
84
|
# on("--[no-]switch","Some switch")
|
57
85
|
#
|
@@ -59,7 +87,7 @@ class App
|
|
59
87
|
#
|
60
88
|
# Require an argument
|
61
89
|
arg :assembly, "assembly name (hg19, mm10, etc.)"
|
62
|
-
arg :chromosome, "id of chromosome (
|
90
|
+
arg :chromosome, "id of chromosome (chr1-chr22 or chrx/chry)"
|
63
91
|
arg :cstart, "start coordinate (inclusive) within the chromosome"
|
64
92
|
arg :cend, "end coordinate within the chromosome"
|
65
93
|
#
|
data/features/fasta_read.feature
CHANGED
@@ -4,7 +4,7 @@ Feature: My bootstrapped app kinda works
|
|
4
4
|
So I don't have to do it myself
|
5
5
|
|
6
6
|
Background:
|
7
|
-
Given a file named "
|
7
|
+
Given a file named "hg19/unmasked/chr1.fa" with:
|
8
8
|
"""
|
9
9
|
>chr1
|
10
10
|
AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTT
|
@@ -13,7 +13,7 @@ Feature: My bootstrapped app kinda works
|
|
13
13
|
GTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT
|
14
14
|
C
|
15
15
|
"""
|
16
|
-
And a file named "
|
16
|
+
And a file named "hg19/snp/chr1.subst.fa" with:
|
17
17
|
"""
|
18
18
|
AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTT
|
19
19
|
CTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACC
|
@@ -26,7 +26,6 @@ Feature: My bootstrapped app kinda works
|
|
26
26
|
When I get help for "fasta_read"
|
27
27
|
Then the exit status should be 0
|
28
28
|
And the banner should be present
|
29
|
-
And there should be a one line summary of what the app does
|
30
29
|
And the banner should include the version
|
31
30
|
And the banner should document that this app takes options
|
32
31
|
And the following options should be documented:
|
@@ -41,7 +40,7 @@ Feature: My bootstrapped app kinda works
|
|
41
40
|
|cend |which is required|
|
42
41
|
|
43
42
|
Scenario Outline: files with one line stream
|
44
|
-
Given a file named "
|
43
|
+
Given a file named "hg19/unmasked/chr1.fa" with:
|
45
44
|
"""
|
46
45
|
>chr1
|
47
46
|
AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTT
|
@@ -50,7 +49,7 @@ Feature: My bootstrapped app kinda works
|
|
50
49
|
GTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT
|
51
50
|
C
|
52
51
|
"""
|
53
|
-
And a file named "
|
52
|
+
And a file named "hg19/snp/chr1.subst.fa" with:
|
54
53
|
"""
|
55
54
|
>chr1
|
56
55
|
AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTT
|
@@ -61,17 +60,16 @@ Feature: My bootstrapped app kinda works
|
|
61
60
|
"""
|
62
61
|
When I successfully run `fasta_read <options>`
|
63
62
|
Then the output should contain "<output>"
|
64
|
-
Then the output should not contain "<noutput>"
|
65
63
|
|
66
64
|
Scenarios: unmasked
|
67
|
-
|options
|
68
|
-
|hg19
|
65
|
+
|options |output |
|
66
|
+
|hg19 chr1 0 200|AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTTCTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACCACAGTTGACTACATCAAGGTCTGCCTCTGATCTGGTGGGAGTGCTGGGTGGTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT|
|
69
67
|
Scenarios: snp
|
70
|
-
|options |output |
|
71
|
-
|hg19
|
68
|
+
|options |output |
|
69
|
+
|hg19 chr1 0 200 --snp|AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTTCTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACCACAGTTGACTACATCAAGGTCTGCCTCTGATCTGGTGGGAGTGCTGGGTGGTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT|
|
72
70
|
|
73
71
|
Scenario: Export to file
|
74
|
-
When I successfully run `fasta_read hg19
|
72
|
+
When I successfully run `fasta_read hg19 chr1 0 200 --output=out.txt`
|
75
73
|
Then the file "out.txt" should contain:
|
76
74
|
"""
|
77
75
|
AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTTCTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACCACAGTTGACTACATCAAGGTCTGCCTCTGATCTGGTGGGAGTGCTGGGTGGTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT
|
@@ -82,5 +80,5 @@ Feature: My bootstrapped app kinda works
|
|
82
80
|
Then the stderr should contain "<output>"
|
83
81
|
Scenarios: incorrect assembly/chromosome
|
84
82
|
|options |output |
|
85
|
-
|foo
|
86
|
-
|hg19
|
83
|
+
|foo chr12 0 3 --log-level=debug |the 'foo' assembly doesn't exist in directory structure |
|
84
|
+
|hg19 chr99 0 3 --log-level=debug|the 'chr99' chromosome doesn't exist in directory structure|
|
data/lib/fasta_read/sequence.rb
CHANGED
@@ -5,11 +5,9 @@ module FastaRead
|
|
5
5
|
include Methadone::Main
|
6
6
|
include Methadone::CLILogging
|
7
7
|
|
8
|
-
attr_reader :separate_lines
|
9
|
-
|
10
8
|
def initialize(fasta, chromosome, cstart, cend)
|
11
9
|
@fasta = fasta.gsub("\n", "")
|
12
|
-
@chromosome = "
|
10
|
+
@chromosome = ">#{chromosome}"
|
13
11
|
@cstart = cstart.to_i
|
14
12
|
@cend = cend.to_i
|
15
13
|
end
|
data/lib/fasta_read/version.rb
CHANGED
@@ -5,13 +5,13 @@ module FastaRead
|
|
5
5
|
let(:fasta) {">chr1\nAATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTT\nCTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACC\nACAGTTGACTACATCAAGGTCTGCCTCTGATCTGGTGGGAGTGCTGGGTG\nGTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT\nC"}
|
6
6
|
describe "#real_cstart" do
|
7
7
|
context "when @cstart is > 0" do
|
8
|
-
subject {Sequence.new(fasta, "
|
8
|
+
subject {Sequence.new(fasta, "chr12", "5", "200").real_cstart}
|
9
9
|
it "should add file header string length" do
|
10
10
|
expect(subject).to eq 10
|
11
11
|
end
|
12
12
|
end
|
13
13
|
context "when @cstart is 0" do
|
14
|
-
subject {Sequence.new(fasta, "
|
14
|
+
subject {Sequence.new(fasta, "chr1", "0", "200").real_cstart}
|
15
15
|
it "should not subtract 1" do
|
16
16
|
expect(subject).to eq 5
|
17
17
|
end
|
@@ -19,14 +19,14 @@ module FastaRead
|
|
19
19
|
end
|
20
20
|
|
21
21
|
describe "#real_cend" do
|
22
|
-
subject {Sequence.new(fasta, "
|
22
|
+
subject {Sequence.new(fasta, "chr1", "0", "200").real_cend}
|
23
23
|
it "should subtract 1 and add file header string length" do
|
24
24
|
expect(subject).to eq 204
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
28
28
|
describe "#process" do
|
29
|
-
subject {Sequence.new(fasta, "
|
29
|
+
subject {Sequence.new(fasta, "chr1", "0", "200").process}
|
30
30
|
describe "with a continuous character stream" do
|
31
31
|
it "builds a character sequence" do
|
32
32
|
expect(subject).to eq "AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTTCTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACCACAGTTGACTACATCAAGGTCTGCCTCTGATCTGGTGGGAGTGCTGGGTGGTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fasta_read
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrea D'Amico
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-04-
|
11
|
+
date: 2014-04-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: methadone
|