fasta_read 1.1.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -1
- data/Gemfile.lock +1 -1
- data/README.md +30 -20
- data/bin/fasta_read +38 -10
- data/features/fasta_read.feature +11 -13
- data/lib/fasta_read/sequence.rb +1 -3
- data/lib/fasta_read/version.rb +1 -1
- data/spec/fasta_read/sequence_spec.rb +4 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0610fba4a055d83670705afed69fd6a98603749b
|
4
|
+
data.tar.gz: 5203a61fa111d3c29f73fc4396cec5f3d9a43070
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a6bf8413263aeab223c0afd9e3e52a491d7c12c3f007438bb4d8ee4d39c19b626ef6275831f658ef2ab8203b8a7818c9e62626e7e275e2b2ab544f7c819dbf60
|
7
|
+
data.tar.gz: a1d0463b0a048cbfbce9b9196f0cce7ca73f94737e226092efe3db9ae91a74fb35f4d862aa3f747bfc4646d3c0d32e48e79a0e2782dac8dbaa7d073b49c18f7e
|
data/.travis.yml
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -24,7 +24,7 @@ Or install it yourself as:
|
|
24
24
|
## Example
|
25
25
|
|
26
26
|
fasta_read hg19 chr12 112123514 112123790 --output=out.txt
|
27
|
-
|
27
|
+
|
28
28
|
Options:
|
29
29
|
-h, --help Show command line help
|
30
30
|
-o, --output OUTPUTFILE outputs sequence to a file
|
@@ -39,7 +39,7 @@ Or install it yourself as:
|
|
39
39
|
assembly
|
40
40
|
assembly name (hg19, mm10, etc.)
|
41
41
|
chromosome
|
42
|
-
id of chromosome (
|
42
|
+
id of chromosome (chr1-chr22 or chrx/chry)
|
43
43
|
cstart
|
44
44
|
start coordinate (inclusive) within the chromosome
|
45
45
|
cend
|
@@ -51,30 +51,40 @@ stdout: Extracted sequence (only)
|
|
51
51
|
|
52
52
|
stderr: Any errors.
|
53
53
|
|
54
|
-
using --output option
|
54
|
+
Using the --output option exports the sequence to a file.
|
55
55
|
|
56
56
|
## Supporting Requirements
|
57
57
|
|
58
58
|
The program depends on being run at the top of a directory tree containing .fa files. Each .fa file should map to a single chromosome.
|
59
|
-
|
60
59
|
The directory tree will have separate branches for SNPs and unmasked files.
|
61
60
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
61
|
+
For example, given the following tree, the command expects to be run inside the 'fasta' directory:
|
62
|
+
|
63
|
+
fasta
|
64
|
+
├── hg19
|
65
|
+
│ ├── snp
|
66
|
+
│ │ ├── chr1.subst.fa
|
67
|
+
| │ ├── chr2.subst.fa
|
68
|
+
| │ ├── chr3.subst.fa
|
69
|
+
| │ └── ....
|
70
|
+
│ └── unmasked
|
71
|
+
│ ├── chr1.fa
|
72
|
+
| ├── chr2.fa
|
73
|
+
| ├── chr3.fa
|
74
|
+
| └── ....
|
75
|
+
└── mm10
|
76
|
+
├── snp
|
77
|
+
│ ├── chr1.subst.fa
|
78
|
+
│ ├── chr2.subst.fa
|
79
|
+
│ ├── chr3.subst.fa
|
80
|
+
│ └── ....
|
81
|
+
└── unmasked
|
82
|
+
├── chr1.fa
|
83
|
+
├── chr2.fa
|
84
|
+
├── chr3.fa
|
85
|
+
└── ....
|
86
|
+
|
87
|
+
Fasta files can be downloaded at:
|
78
88
|
|
79
89
|
http://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/
|
80
90
|
http://hgdownload.cse.ucsc.edu/goldenPath/hg19/snp138Mask/
|
data/bin/fasta_read
CHANGED
@@ -9,12 +9,10 @@ class App
|
|
9
9
|
include Methadone::CLILogging
|
10
10
|
|
11
11
|
main do |assembly, chromosome, cstart, cend|
|
12
|
-
|
13
|
-
subst = masked_unmasked(options)[1]
|
14
|
-
path = "fasta/#{assembly}/#{snp}/chr#{chromosome + subst}.fa"
|
12
|
+
path = "#{assembly}/#{snp_or_unmasked(options)[0]}/#{chromosome + snp_or_unmasked(options)[1]}.fa"
|
15
13
|
exit_now_with!(assembly_or_chromosome(assembly, chromosome)) unless File.exist?(path)
|
16
14
|
fasta = IO.read(path)
|
17
|
-
debug("About to read #{assembly} #{
|
15
|
+
debug("About to read #{assembly} #{snp_or_unmasked(options)[0]} chr#{chromosome}:#{cstart}-#{cend}")
|
18
16
|
sequence = FastaRead::Sequence.new(fasta, chromosome, cstart, cend).process
|
19
17
|
if options[:output]
|
20
18
|
File.open(options[:output], "w") do |file|
|
@@ -26,12 +24,12 @@ class App
|
|
26
24
|
info "#{sequence.length} base pairs"
|
27
25
|
end
|
28
26
|
|
29
|
-
def self.
|
27
|
+
def self.snp_or_unmasked(options)
|
30
28
|
options[:snp] ? ["snp", ".subst"] : ["unmasked", ""]
|
31
29
|
end
|
32
30
|
|
33
31
|
def self.assembly_or_chromosome(assembly, chromosome)
|
34
|
-
Dir.exist?("
|
32
|
+
Dir.exist?("#{assembly}") ? [chromosome, "chromosome"] : [assembly, "assembly"]
|
35
33
|
end
|
36
34
|
|
37
35
|
def self.exit_now_with!(value_and_argument)
|
@@ -41,17 +39,47 @@ class App
|
|
41
39
|
|
42
40
|
# Declare command-line interface here
|
43
41
|
|
44
|
-
description
|
42
|
+
description <<-EOF
|
43
|
+
Extract DNA Fasta sequence from assembly files.
|
44
|
+
|
45
|
+
The program depends on being run at the top of a directory tree containing .fa files. The .fa files should be of the form where each file maps to a single chomosome.
|
46
|
+
The directory tree will have separate branches for SNPs and unmasked files.
|
47
|
+
|
48
|
+
For example, provided the following tree the command expects to be run inside the 'fasta' directory:
|
49
|
+
|
50
|
+
└── fasta
|
51
|
+
├── hg19
|
52
|
+
│ ├── snp
|
53
|
+
│ │ ├── chr1.subst.fa
|
54
|
+
| │ ├── chr2.subst.fa
|
55
|
+
| │ ├── chr3.subst.fa
|
56
|
+
| │ └── ....
|
57
|
+
│ └── unmasked
|
58
|
+
│ ├── chr1.fa
|
59
|
+
| ├── chr2.fa
|
60
|
+
| ├── chr3.fa
|
61
|
+
| └── ....
|
62
|
+
└── mm10
|
63
|
+
├── snp
|
64
|
+
│ ├── chr1.subst.fa
|
65
|
+
│ ├── chr2.subst.fa
|
66
|
+
│ ├── chr3.subst.fa
|
67
|
+
│ └── ....
|
68
|
+
└── unmasked
|
69
|
+
├── chr1.fa
|
70
|
+
├── chr2.fa
|
71
|
+
├── chr3.fa
|
72
|
+
└── ....
|
73
|
+
EOF
|
74
|
+
|
45
75
|
#
|
46
76
|
# Accept flags via:
|
47
77
|
# on("--flag VAL","Some flag")
|
48
78
|
# options[flag] will contain VAL
|
49
|
-
options['multiple-lines'] = true
|
50
79
|
#
|
51
80
|
# Specify switches via:
|
52
81
|
on "-o OUTPUTFILE", "--output", "outputs sequence to a file"
|
53
82
|
on "--snp", "return the sequence from the SNP-masked assembly"
|
54
|
-
on "--multiple-lines", "return the sequence from the SNP-masked assembly"
|
55
83
|
|
56
84
|
# on("--[no-]switch","Some switch")
|
57
85
|
#
|
@@ -59,7 +87,7 @@ class App
|
|
59
87
|
#
|
60
88
|
# Require an argument
|
61
89
|
arg :assembly, "assembly name (hg19, mm10, etc.)"
|
62
|
-
arg :chromosome, "id of chromosome (
|
90
|
+
arg :chromosome, "id of chromosome (chr1-chr22 or chrx/chry)"
|
63
91
|
arg :cstart, "start coordinate (inclusive) within the chromosome"
|
64
92
|
arg :cend, "end coordinate within the chromosome"
|
65
93
|
#
|
data/features/fasta_read.feature
CHANGED
@@ -4,7 +4,7 @@ Feature: My bootstrapped app kinda works
|
|
4
4
|
So I don't have to do it myself
|
5
5
|
|
6
6
|
Background:
|
7
|
-
Given a file named "
|
7
|
+
Given a file named "hg19/unmasked/chr1.fa" with:
|
8
8
|
"""
|
9
9
|
>chr1
|
10
10
|
AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTT
|
@@ -13,7 +13,7 @@ Feature: My bootstrapped app kinda works
|
|
13
13
|
GTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT
|
14
14
|
C
|
15
15
|
"""
|
16
|
-
And a file named "
|
16
|
+
And a file named "hg19/snp/chr1.subst.fa" with:
|
17
17
|
"""
|
18
18
|
AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTT
|
19
19
|
CTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACC
|
@@ -26,7 +26,6 @@ Feature: My bootstrapped app kinda works
|
|
26
26
|
When I get help for "fasta_read"
|
27
27
|
Then the exit status should be 0
|
28
28
|
And the banner should be present
|
29
|
-
And there should be a one line summary of what the app does
|
30
29
|
And the banner should include the version
|
31
30
|
And the banner should document that this app takes options
|
32
31
|
And the following options should be documented:
|
@@ -41,7 +40,7 @@ Feature: My bootstrapped app kinda works
|
|
41
40
|
|cend |which is required|
|
42
41
|
|
43
42
|
Scenario Outline: files with one line stream
|
44
|
-
Given a file named "
|
43
|
+
Given a file named "hg19/unmasked/chr1.fa" with:
|
45
44
|
"""
|
46
45
|
>chr1
|
47
46
|
AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTT
|
@@ -50,7 +49,7 @@ Feature: My bootstrapped app kinda works
|
|
50
49
|
GTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT
|
51
50
|
C
|
52
51
|
"""
|
53
|
-
And a file named "
|
52
|
+
And a file named "hg19/snp/chr1.subst.fa" with:
|
54
53
|
"""
|
55
54
|
>chr1
|
56
55
|
AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTT
|
@@ -61,17 +60,16 @@ Feature: My bootstrapped app kinda works
|
|
61
60
|
"""
|
62
61
|
When I successfully run `fasta_read <options>`
|
63
62
|
Then the output should contain "<output>"
|
64
|
-
Then the output should not contain "<noutput>"
|
65
63
|
|
66
64
|
Scenarios: unmasked
|
67
|
-
|options
|
68
|
-
|hg19
|
65
|
+
|options |output |
|
66
|
+
|hg19 chr1 0 200|AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTTCTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACCACAGTTGACTACATCAAGGTCTGCCTCTGATCTGGTGGGAGTGCTGGGTGGTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT|
|
69
67
|
Scenarios: snp
|
70
|
-
|options |output |
|
71
|
-
|hg19
|
68
|
+
|options |output |
|
69
|
+
|hg19 chr1 0 200 --snp|AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTTCTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACCACAGTTGACTACATCAAGGTCTGCCTCTGATCTGGTGGGAGTGCTGGGTGGTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT|
|
72
70
|
|
73
71
|
Scenario: Export to file
|
74
|
-
When I successfully run `fasta_read hg19
|
72
|
+
When I successfully run `fasta_read hg19 chr1 0 200 --output=out.txt`
|
75
73
|
Then the file "out.txt" should contain:
|
76
74
|
"""
|
77
75
|
AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTTCTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACCACAGTTGACTACATCAAGGTCTGCCTCTGATCTGGTGGGAGTGCTGGGTGGTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT
|
@@ -82,5 +80,5 @@ Feature: My bootstrapped app kinda works
|
|
82
80
|
Then the stderr should contain "<output>"
|
83
81
|
Scenarios: incorrect assembly/chromosome
|
84
82
|
|options |output |
|
85
|
-
|foo
|
86
|
-
|hg19
|
83
|
+
|foo chr12 0 3 --log-level=debug |the 'foo' assembly doesn't exist in directory structure |
|
84
|
+
|hg19 chr99 0 3 --log-level=debug|the 'chr99' chromosome doesn't exist in directory structure|
|
data/lib/fasta_read/sequence.rb
CHANGED
@@ -5,11 +5,9 @@ module FastaRead
|
|
5
5
|
include Methadone::Main
|
6
6
|
include Methadone::CLILogging
|
7
7
|
|
8
|
-
attr_reader :separate_lines
|
9
|
-
|
10
8
|
def initialize(fasta, chromosome, cstart, cend)
|
11
9
|
@fasta = fasta.gsub("\n", "")
|
12
|
-
@chromosome = "
|
10
|
+
@chromosome = ">#{chromosome}"
|
13
11
|
@cstart = cstart.to_i
|
14
12
|
@cend = cend.to_i
|
15
13
|
end
|
data/lib/fasta_read/version.rb
CHANGED
data/spec/fasta_read/sequence_spec.rb
CHANGED
@@ -5,13 +5,13 @@ module FastaRead
|
|
5
5
|
let(:fasta) {">chr1\nAATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTT\nCTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACC\nACAGTTGACTACATCAAGGTCTGCCTCTGATCTGGTGGGAGTGCTGGGTG\nGTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT\nC"}
|
6
6
|
describe "#real_cstart" do
|
7
7
|
context "when @cstart is > 0" do
|
8
|
-
subject {Sequence.new(fasta, "
|
8
|
+
subject {Sequence.new(fasta, "chr12", "5", "200").real_cstart}
|
9
9
|
it "should add file header string length" do
|
10
10
|
expect(subject).to eq 10
|
11
11
|
end
|
12
12
|
end
|
13
13
|
context "when @cstart is 0" do
|
14
|
-
subject {Sequence.new(fasta, "
|
14
|
+
subject {Sequence.new(fasta, "chr1", "0", "200").real_cstart}
|
15
15
|
it "should not subtract 1" do
|
16
16
|
expect(subject).to eq 5
|
17
17
|
end
|
@@ -19,14 +19,14 @@ module FastaRead
|
|
19
19
|
end
|
20
20
|
|
21
21
|
describe "#real_cend" do
|
22
|
-
subject {Sequence.new(fasta, "
|
22
|
+
subject {Sequence.new(fasta, "chr1", "0", "200").real_cend}
|
23
23
|
it "should subtract 1 and add file header string length" do
|
24
24
|
expect(subject).to eq 204
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
28
28
|
describe "#process" do
|
29
|
-
subject {Sequence.new(fasta, "
|
29
|
+
subject {Sequence.new(fasta, "chr1", "0", "200").process}
|
30
30
|
describe "with a continuous character stream" do
|
31
31
|
it "builds a character sequence" do
|
32
32
|
expect(subject).to eq "AATCACACGTGCAGGAACCCTTTTCCAAAGGAGGGTCACGCTCACAGCTTCTGGAGTAGGACATGGACTTGTCTTTTTGGAGGCCCATCCTCAACGCACCACAGTTGACTACATCAAGGTCTGCCTCTGATCTGGTGGGAGTGCTGGGTGGTCTGTCTCCACCAGCACTTTGTGGGTGGGCTCTGTCCCCAGGAAATGCT"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fasta_read
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrea D'Amico
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-04-
|
11
|
+
date: 2014-04-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: methadone
|