bio-maf 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/bio/maf/tiler.rb CHANGED
@@ -13,16 +13,44 @@ module Bio::MAF
13
13
  attr_reader :reference
14
14
  # GenomicInterval
15
15
  attr_accessor :interval
16
+
17
+ # The species of interest to extract from the MAF file. Will be
18
+ # set as a {Parser#sequence_filter} for parsing. Defaults to the
19
+ # keys of {#species_map}.
20
+ #
21
+ # @return [Array<String>]
16
22
  attr_accessor :species
23
+
24
+ # A hash mapping species to their desired output names.
25
+ #
26
+ # @return [Hash]
17
27
  attr_accessor :species_map
18
28
 
29
+ # The character used to fill regions where no sequence data is available for a particular species. Defaults to `*`.
30
+ # @return [String]
31
+ attr_reader :fill_char
32
+
19
33
  def initialize
20
34
  @species_map = {}
35
+ self.fill_char = '*'
36
+ end
37
+
38
+ # Set the character to be used for filling regions with no
39
+ # sequence data from the MAF file or a reference sequence.
40
+ # @param c [String] a one-character String to fill with
41
+ def fill_char=(c)
42
+ unless c.is_a?(String) && c.length == 1
43
+ raise ArgumentError, "not a single character: #{c.inspect}"
44
+ end
45
+ @fill_char = c
21
46
  end
22
47
 
23
- # Set the reference sequence.
48
+ # Set the reference sequence. This can be a {Pathname} or a
49
+ # {String} giving the path to an optionally-gzipped FASTA file, an
50
+ # open {IO} stream to a FASTA file, a String containing FASTA
51
+ # data, or a {FASTARangeReader} instance.
24
52
  #
25
- # @param source [FASTARangeReader, String, Pathname]
53
+ # @param source [FASTARangeReader, String, Pathname, #readline]
26
54
  def reference=(source)
27
55
  ref = case
28
56
  when source.is_a?(FASTARangeReader)
@@ -57,6 +85,13 @@ module Bio::MAF
57
85
  species || species_map.keys
58
86
  end
59
87
 
88
+ def species_for_output
89
+ species_to_use.collect { |s| species_map[s] || s }
90
+ end
91
+
92
+ # Return an array of tiled sequence data, in the order given by
93
+ # {#species_to_use}.
94
+ # @return [Array<String>]
60
95
  def tile
61
96
  parser.sequence_filter[:only_species] = species_to_use
62
97
  # TODO: remove gaps
@@ -88,8 +123,8 @@ module Bio::MAF
88
123
  else
89
124
  'N' * range_size
90
125
  end
91
- stars = '*' * range_size
92
- nonref_text.each { |t| t << stars }
126
+ fill_text = fill_char * range_size
127
+ nonref_text.each { |t| t << fill_text }
93
128
  else
94
129
  # covered by an alignment block
95
130
  t_range = block.ref_seq.text_range(g_range)
@@ -100,8 +135,8 @@ module Bio::MAF
100
135
  # got alignment text
101
136
  sp_text << seq.text.slice(t_range)
102
137
  else
103
- # no alignment for this one here, use '*'
104
- sp_text << '*' * (t_range.end - t_range.begin)
138
+ # no alignment for this one here, use the fill char
139
+ sp_text << fill_char * (t_range.end - t_range.begin)
105
140
  end
106
141
  end
107
142
  end
@@ -109,9 +144,26 @@ module Bio::MAF
109
144
  text
110
145
  end
111
146
 
147
+ # Tile sequences to build a new {Bio::BioAlignment::Alignment
148
+ # Alignment} object. This will have one
149
+ # {Bio::BioAlignment::Sequence Sequence} per entry in {#species}
150
+ # or {#species_map}, in the same order. Each sequence will have an
151
+ # {Bio::BioAlignment::Sequence#id id} given by {#species_map} or,
152
+ # if none is present, the identifier from {#species}.
153
+ #
154
+ # @return [Bio::BioAlignment::Alignment]
155
+ # @api public
156
+ def build_bio_alignment
157
+ Bio::BioAlignment::Alignment.new(tile(), species_for_output)
158
+ end
159
+
160
+ # Write a FASTA representation of the tiled sequences to the given
161
+ # output stream.
162
+ #
163
+ # @param [#puts] f the output stream to write the FASTA data to.
164
+ # @api public
112
165
  def write_fasta(f)
113
- species_to_use.zip(tile()) do |species, text|
114
- sp_out = species_map[species] || species
166
+ species_for_output.zip(tile()) do |sp_out, text|
115
167
  f.puts ">#{sp_out}"
116
168
  f.puts text
117
169
  end
@@ -59,5 +59,31 @@ module Bio::MAF
59
59
  end
60
60
  end
61
61
  end
62
+
63
# Maximum number of sequence characters written per line of FASTA output.
FASTA_LINE_LEN = 72

# Writes sequences to an output stream in FASTA format: a `>` description
# line per sequence, followed by the sequence text wrapped at
# FASTA_LINE_LEN characters per line.
class FASTAWriter

  # @param outf [#puts, #close] the stream that FASTA data is written to
  def initialize(outf)
    @f = outf
  end

  # Write every sequence of the given block, skipping those that report
  # themselves as empty.
  #
  # @param block [#sequences] object whose sequences respond to
  #   `empty?`, `fasta_desc` and `text`
  def write_block(block)
    block.sequences.each do |seq|
      write_sequence(seq) unless seq.empty?
    end
  end

  # Write a single sequence: its description line, then its text in
  # chunks of at most FASTA_LINE_LEN characters. A sequence with empty
  # text produces only the description line.
  #
  # @param seq [#fasta_desc, #text] the sequence to write
  def write_sequence(seq)
    @f.puts(">#{seq.fasta_desc}")
    text = seq.text
    # The range is exclusive so that the offset text.size is never
    # visited; slicing there yields "" and would emit a spurious blank
    # line whenever the length is an exact multiple of FASTA_LINE_LEN.
    (0...text.size).step(FASTA_LINE_LEN) do |pos|
      @f.puts(text.slice(pos, FASTA_LINE_LEN))
    end
  end

  # Close the underlying output stream.
  def close
    @f.close
  end
end
62
88
 
63
89
  end
data/man/maf_extract.1 ADDED
@@ -0,0 +1,159 @@
1
+ .\" generated with Ronn/v0.7.3
2
+ .\" http://github.com/rtomayko/ronn/tree/0.7.3
3
+ .
4
+ .TH "MAF_EXTRACT" "1" "July 2012" "BioRuby" "BioRuby Manual"
5
+ .
6
+ .SH "NAME"
7
+ \fBmaf_extract\fR \- extract blocks from MAF files
8
+ .
9
+ .SH "SYNOPSIS"
10
+ \fBmaf_extract\fR \-m MAF [\-i INDEX] \-\-interval SEQ:START:END \fIOPTIONS\fR
11
+ .
12
+ .P
13
+ \fBmaf_extract\fR \-m MAF [\-i INDEX] \-\-bed BED \fIOPTIONS\fR
14
+ .
15
+ .P
16
+ \fBmaf_extract\fR \-d MAFDIR \-\-interval SEQ:START:END \fIOPTIONS\fR
17
+ .
18
+ .P
19
+ \fBmaf_extract\fR \-d MAFDIR \-\-bed BED \fIOPTIONS\fR
20
+ .
21
+ .SH "DESCRIPTION"
22
+ \fBmaf_extract\fR extracts alignment blocks from one or more indexed MAF files, according to either a genomic interval specified with \fB\-\-interval\fR or multiple intervals given in a BED file specified with \fB\-\-bed\fR\.
23
+ .
24
+ .P
25
+ It can either match blocks intersecting the specified intervals with \fB\-\-mode intersect\fR, the default, or extract slices of them which cover only the specified intervals, with \fB\-\-mode slice\fR\.
26
+ .
27
+ .P
28
+ Blocks and the sequences they contain can be filtered with a variety of options including \fB\-\-only\-species\fR, \fB\-\-with\-all\-species\fR, \fB\-\-min\-sequences\fR, \fB\-\-min\-text\-size\fR, and \fB\-\-max\-text\-size\fR\.
29
+ .
30
+ .P
31
+ With the \fB\-\-join\-blocks\fR option, adjacent parsed blocks can be joined if sequence filtering has removed a species causing them to be separated\. The \fB\-\-remove\-gaps\fR option will remove columns containing only gaps (\fB\-\fR)\.
32
+ .
33
+ .P
34
+ Blocks can be output in MAF format, with \fB\-\-format maf\fR (the default), or FASTA format, with \fB\-\-format fasta\fR\. Output can be directed to a file with \fB\-\-output\fR\.
35
+ .
36
+ .P
37
+ This tool exposes almost all the random\-access functionality of the Bio::MAF::Access class\. The exception is MAF tiling, which is provided by maf_tile(1)\.
38
+ .
39
+ .SH "FILES"
40
+ A single MAF file can be processed by specifying it with \fB\-\-maf\fR\. Its accompanying index, created by maf_index(1), is specified with \fB\-\-index\fR\. If \fB\-\-maf\fR is given but no index is specified, the entire file will be parsed to build a temporary in\-memory index\. This facilitates processing small, transient MAF files\. However, on a large file this will incur a great deal of overhead; files expected to be used more than once should be indexed with maf_index(1)\.
41
+ .
42
+ .P
43
+ Alternatively, a directory of indexed MAF files can be specified with \fB\-\-maf\-dir\fR; in this case, they will all be used to satisfy queries\.
44
+ .
45
+ .SH "OPTIONS"
46
+ MAF source options:
47
+ .
48
+ .TP
49
+ \fB\-m\fR, \fB\-\-maf MAF\fR
50
+ A single MAF file to process\.
51
+ .
52
+ .TP
53
+ \fB\-i\fR, \fB\-\-index INDEX\fR
54
+ An index for the file specified with \fB\-\-maf\fR, as created by maf_index(1)\.
55
+ .
56
+ .TP
57
+ \fB\-d\fR, \fB\-\-maf\-dir DIR\fR
58
+ A directory of indexed MAF files\.
59
+ .
60
+ .P
61
+ Extraction options:
62
+ .
63
+ .TP
64
+ \fB\-\-mode (intersect | slice)\fR
65
+ The extraction mode to use\. With \fB\-\-mode intersect\fR, any alignment block intersecting the genomic intervals specified will be matched in its entirety\. With \fB\-\-mode slice\fR, intersecting blocks will be matched in the same way, but columns extending outside the specified interval will be removed\.
66
+ .
67
+ .TP
68
+ \fB\-\-bed BED\fR
69
+ The specified file will be parsed as a BED file, and each interval it contains will be matched in turn\.
70
+ .
71
+ .TP
72
+ \fB\-\-interval SEQ:START:END\fR
73
+ A single zero\-based half\-open genomic interval will be matched, with sequence identifier \fIseq\fR, (inclusive) start position \fIstart\fR, and (exclusive) end position \fIend\fR\.
74
+ .
75
+ .P
76
+ Output options:
77
+ .
78
+ .TP
79
+ \fB\-f\fR, \fB\-\-format (maf | fasta)\fR
80
+ Output will be written in the specified format, either MAF or FASTA\.
81
+ .
82
+ .TP
83
+ \fB\-o\fR, \fB\-\-output OUT\fR
84
+ Output will be written to the file \fIout\fR\.
85
+ .
86
+ .P
87
+ Filtering options:
88
+ .
89
+ .TP
90
+ \fB\-\-only\-species (SP1,SP2,SP3 | @FILE)\fR
91
+ Alignment blocks will be filtered to contain only the specified species\. These can be given as a comma\-separated list or as a file, prefixed with \fB@\fR, from which a list of species will be read\.
92
+ .
93
+ .TP
94
+ \fB\-\-with\-all\-species (SP1,SP2,SP3 | @FILE)\fR
95
+ Only alignment blocks containing all the specified species will be matched\. These can be given as a comma\-separated list or as a file, prefixed with \fB@\fR, from which a list of species will be read\.
96
+ .
97
+ .TP
98
+ \fB\-\-min\-sequences N\fR
99
+ Only alignment blocks containing at least \fIn\fR sequences will be matched\.
100
+ .
101
+ .TP
102
+ \fB\-\-min\-text\-size N\fR
103
+ Only alignment blocks with a text size (including gaps) of at least \fIn\fR will be matched\.
104
+ .
105
+ .TP
106
+ \fB\-\-max\-text\-size N\fR
107
+ Only alignment blocks with a text size (including gaps) of at most \fIn\fR will be matched\.
108
+ .
109
+ .P
110
+ Block processing options:
111
+ .
112
+ .TP
113
+ \fB\-\-join\-blocks\fR
114
+ If sequence filtering with \fB\-\-only\-species\fR removes a species which caused two adjacent blocks to be separate, this option will join them together into a single alignment block\. The filtered blocks must contain the same sequences in contiguous positions and on the same strand\.
115
+ .
116
+ .TP
117
+ \fB\-\-remove\-gaps\fR
118
+ If sequence filtering with \fB\-\-only\-species\fR leaves a block containing columns consisting only of gap characters (\fB\-\fR), these will be removed\.
119
+ .
120
+ .TP
121
+ \fB\-\-parse\-extended\fR
122
+ Parse \fBi\fR lines, giving information on the context of sequence lines, and \fBq\fR lines, giving quality scores\.
123
+ .
124
+ .TP
125
+ \fB\-\-parse\-empty\fR
126
+ Parse \fBe\fR lines, indicating cases where a species does not align with the current block but does align with blocks before and after it\.
127
+ .
128
+ .P
129
+ Logging options:
130
+ .
131
+ .TP
132
+ \fB\-q\fR, \fB\-\-quiet\fR
133
+ Run quietly, with warnings suppressed\.
134
+ .
135
+ .TP
136
+ \fB\-v\fR, \fB\-\-verbose\fR
137
+ Run verbosely, with additional informational messages\.
138
+ .
139
+ .TP
140
+ \fB\-\-debug\fR
141
+ Log debugging information\.
142
+ .
143
+ .SH "EXAMPLES"
144
+ TODO
145
+ .
146
+ .SH "ENVIRONMENT"
147
+ \fBmaf_extract\fR is a Ruby program and relies on ordinary Ruby environment variables\.
148
+ .
149
+ .SH "BUGS"
150
+ No provision exists for writing output to multiple files\.
151
+ .
152
+ .P
153
+ FASTA description lines are always in the format \fB>source:start\-end\fR\.
154
+ .
155
+ .SH "COPYRIGHT"
156
+ \fBmaf_extract\fR is copyright (C) 2012 Clayton Wheeler\.
157
+ .
158
+ .SH "SEE ALSO"
159
+ ruby(1), maf_index(1), maf_tile(1)
@@ -0,0 +1,175 @@
1
+ maf_extract(1) -- extract blocks from MAF files
2
+ ===============================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `maf_extract` -m MAF [-i INDEX] --interval SEQ:START:END [OPTIONS]
7
+
8
+ `maf_extract` -m MAF [-i INDEX] --bed BED [OPTIONS]
9
+
10
+ `maf_extract` -d MAFDIR --interval SEQ:START:END [OPTIONS]
11
+
12
+ `maf_extract` -d MAFDIR --bed BED [OPTIONS]
13
+
14
+ ## DESCRIPTION
15
+
16
+ **maf_extract** extracts alignment blocks from one or more indexed MAF
17
+ files, according to either a genomic interval specified with
18
+ `--interval` or multiple intervals given in a BED file specified with
19
+ `--bed`.
20
+
21
+ It can either match blocks intersecting the specified intervals with
22
+ `--mode intersect`, the default, or extract slices of them which cover
23
+ only the specified intervals, with `--mode slice`.
24
+
25
+ Blocks and the sequences they contain can be filtered with a variety
26
+ of options including `--only-species`, `--with-all-species`,
27
+ `--min-sequences`, `--min-text-size`, and `--max-text-size`.
28
+
29
+ With the `--join-blocks` option, adjacent parsed blocks can be joined if
30
+ sequence filtering has removed a species causing them to be
31
+ separated. The `--remove-gaps` option will remove columns containing
32
+ only gaps (`-`).
33
+
34
+ Blocks can be output in MAF format, with `--format maf` (the default),
35
+ or FASTA format, with `--format fasta`. Output can be directed to a
36
+ file with `--output`.
37
+
38
+ This tool exposes almost all the random-access functionality of the
39
+ Bio::MAF::Access class. The exception is MAF tiling, which is provided
40
+ by maf_tile(1).
41
+
42
+ ## FILES
43
+
44
+ A single MAF file can be processed by specifying it with `--maf`. Its
45
+ accompanying index, created by maf_index(1), is specified with
46
+ `--index`. If `--maf` is given but no index is specified, the entire
47
+ file will be parsed to build a temporary in-memory index. This
48
+ facilitates processing small, transient MAF files. However, on a large
49
+ file this will incur a great deal of overhead; files expected to be
50
+ used more than once should be indexed with maf_index(1).
51
+
52
+ Alternatively, a directory of indexed MAF files can be specified with
53
+ `--maf-dir`; in this case, they will all be used to satisfy queries.
54
+
55
+ ## OPTIONS
56
+
57
+ MAF source options:
58
+
59
+ * `-m`, `--maf MAF`:
60
+ A single MAF file to process.
61
+
62
+ * `-i`, `--index INDEX`:
63
+ An index for the file specified with `--maf`, as created by
64
+ maf_index(1).
65
+
66
+ * `-d`, `--maf-dir DIR`:
67
+ A directory of indexed MAF files.
68
+
69
+ Extraction options:
70
+
71
+ * `--mode (intersect | slice)`:
72
+ The extraction mode to use. With `--mode intersect`, any alignment
73
+ block intersecting the genomic intervals specified will be matched
74
+ in its entirety. With `--mode slice`, intersecting blocks will be
75
+ matched in the same way, but columns extending outside the
76
+ specified interval will be removed.
77
+
78
+ * `--bed BED`:
79
+ The specified file will be parsed as a BED file, and each interval
80
+ it contains will be matched in turn.
81
+
82
+ * `--interval SEQ:START:END`:
83
+ A single zero-based half-open genomic interval will be matched,
84
+ with sequence identifier <seq>, (inclusive) start position <start>,
85
+ and (exclusive) end position <end>.
86
+
87
+ Output options:
88
+
89
+ * `-f`, `--format (maf | fasta)`:
90
+ Output will be written in the specified format, either MAF or
91
+ FASTA.
92
+
93
+ * `-o`, `--output OUT`:
94
+ Output will be written to the file <out>.
95
+
96
+ Filtering options:
97
+
98
+ * `--only-species (SP1,SP2,SP3 | @FILE)`:
99
+ Alignment blocks will be filtered to contain only the specified
100
+ species. These can be given as a comma-separated list or as a file,
101
+ prefixed with `@`, from which a list of species will be read.
102
+
103
+ * `--with-all-species (SP1,SP2,SP3 | @FILE)`:
104
+ Only alignment blocks containing all the specified species will be
105
+ matched. These can be given as a comma-separated list or as a file,
106
+ prefixed with `@`, from which a list of species will be read.
107
+
108
+ * `--min-sequences N`:
109
+ Only alignment blocks containing at least <n> sequences will be
110
+ matched.
111
+
112
+ * `--min-text-size N`:
113
+ Only alignment blocks with a text size (including gaps) of at least
114
+ <n> will be matched.
115
+
116
+ * `--max-text-size N`:
117
+ Only alignment blocks with a text size (including gaps) of at most
118
+ <n> will be matched.
119
+
120
+ Block processing options:
121
+
122
+ * `--join-blocks`:
123
+ If sequence filtering with `--only-species` removes a species which
124
+ caused two adjacent blocks to be separate, this option will join
125
+ them together into a single alignment block. The filtered blocks
126
+ must contain the same sequences in contiguous positions and on the
127
+ same strand.
128
+
129
+ * `--remove-gaps`:
130
+ If sequence filtering with `--only-species` leaves a block
131
+ containing columns consisting only of gap characters (`-`), these
132
+ will be removed.
133
+
134
+ * `--parse-extended`:
135
+ Parse `i` lines, giving information on the context of sequence
136
+ lines, and `q` lines, giving quality scores.
137
+
138
+ * `--parse-empty`:
139
+ Parse `e` lines, indicating cases where a species does not align
140
+ with the current block but does align with blocks before and after
141
+ it.
142
+
143
+ Logging options:
144
+
145
+ * `-q`, `--quiet`:
146
+ Run quietly, with warnings suppressed.
147
+
148
+ * `-v`, `--verbose`:
149
+ Run verbosely, with additional informational messages.
150
+
151
+ * `--debug`:
152
+ Log debugging information.
153
+
154
+ ## EXAMPLES
155
+
156
+ TODO
157
+
158
+ ## ENVIRONMENT
159
+
160
+ `maf_extract` is a Ruby program and relies on ordinary Ruby environment
161
+ variables.
162
+
163
+ ## BUGS
164
+
165
+ No provision exists for writing output to multiple files.
166
+
167
+ FASTA description lines are always in the format `>source:start-end`.
168
+
169
+ ## COPYRIGHT
170
+
171
+ `maf_extract` is copyright (C) 2012 Clayton Wheeler.
172
+
173
+ ## SEE ALSO
174
+
175
+ ruby(1), maf_index(1), maf_tile(1)