gfa 0.6.0 → 0.6.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc98fa1d3479be3fee554dab64876d6cd42795aec08a6b60be6ccb148c57c679
4
- data.tar.gz: 28ef77ca42b374e58c983f474d44801060dadb2df912230e431cf40ee8f1386c
3
+ metadata.gz: 8069525ab289a4299c4d51df9a58a083df95292cd7bc59b83516ac4c5e7508f2
4
+ data.tar.gz: c1398a889d1f49431bfa01b196084ee9c448c60426fb179801151e7731538c03
5
5
  SHA512:
6
- metadata.gz: 1a7226dbe5813cc9e7b0cde0e756b0b41395d45e690fd3df0e6ad266f51a209d55d72fdaeaab5aaebd3fec7ce033b54bb04510903cc3eadef230bd30485f037e
7
- data.tar.gz: 32f081a9cf35219f05cb21654011a32a1645f13cbf1d7dda320ebef0b7d893523943cd71f8c9f8a310f1021b6adfaa99b1f863d498be1bbb77bf0a5e04779efd
6
+ metadata.gz: d97f574a6081727f5687e259a954881351bf7767ec36d1f5ac8b2e6eef939a59a9406077619b306d9782e99357ec2cb98536b4e73474a9aad75c61614c7f7d13
7
+ data.tar.gz: 385a463ad705401ec083375b917d3654633f44e3828fcc93546422fb80712fc4bc22dbc2dd128dbdf1db44a8c60ed4d7a31df558db8786eb1fca12225a29b2ab
data/bin/gfa-add-gaf CHANGED
@@ -18,11 +18,13 @@ unless degree
18
18
  <input-gaf> Input GAF file to read
19
19
  <output> Output GFA file to write
20
20
  <degree> Maximum degree of separation between the segment set in the GAF
21
- and any other included segments. If 0, only segments are
22
- included. If 1, only the target segments, records linking to
23
- them, and segments linked by those records. Any integer > 1
24
- includes additional expansion rounds for those linked segments.
25
- Use -1 to include the complete original GAF without subsetting.
21
+ and any other segments included in the output GFA.
22
+ - -1: include the complete original GAF without subsetting.
23
+ - 0: only segments in the GAF are included.
24
+ - 1: only the target segments in the GAF, records linking to
25
+ them, and segments linked by those records are included.
26
+ - Any integer > 1: include additional expansion rounds for
27
+ those linked segments.
26
28
  <pref> A prefix to name all recorded paths
27
29
  By default: Based on the GAF file name
28
30
  <threads> If passed, parallelize process with these many threads
@@ -26,4 +26,11 @@ end
26
26
  $stderr.puts "Loading GFA: #{input}"
27
27
  gfa = GFA.load_parallel(input, (threads || 1).to_i)
28
28
 
29
+ $stderr.puts "Saving path sequences: #{output}"
30
+ File.open(output, 'w') do |fasta|
31
+ gfa.paths.set.each do |path|
32
+ fasta.puts '>%s' % path.path_name.value
33
+ fasta.puts path.sequence(gfa)
34
+ end
35
+ end
29
36
 
data/lib/gfa/graph.rb CHANGED
@@ -146,7 +146,7 @@ class GFA
146
146
 
147
147
  def internally_linking_records(segments, edges)
148
148
  $stderr.puts '- Gathering internally linking records'
149
- segments = Hash[segments.set.map { |i| [i.name.value, true]}]
149
+ segments = Hash[segments.map { |i| [i.name.value, true]}]
150
150
  edges.select { |record| record.segment_names_a.all? { |s| segments[s] } }
151
151
  end
152
152
 
@@ -42,4 +42,59 @@ class GFA::Record::Path < GFA::Record
42
42
 
43
43
  segment_names_a.any? { |name| segment.name == name }
44
44
  end
45
+
46
##
# Array of GFA::Field::String with the sequence of each segment listed in
# this path, in path order and in the orientation recorded by the path,
# looked up in +gfa+ (which *must* be indexed).
#
# Raises RuntimeError if +gfa+ is not indexed, if a path entry does not end
# in a '+' or '-' orientation, or if a segment name is not found in +gfa+.
#
# TODO: Distinguish between a direct path (separated by comma) and a
#       jump (separated by semicolon). Jumps include a distance estimate
#       (column 6, optional) which could be used to add Ns between segment
#       sequences (from GFA 1.2)
def segment_sequences(gfa)
  raise "Unindexed GFA" unless gfa.indexed?

  segment_names.value.split(/[,;]/).map do |entry|
    # The last character of each entry is the orientation, the rest is the
    # segment name. Slice instead of editing the entry in place.
    orientation = entry[-1]
    unless ['+', '-'].include?(orientation)
      raise "Unknown orientation: #{orientation} (path: #{path_name})"
    end

    name = entry[0..-2]
    segment = gfa.segments[name]
    # Fail with a descriptive error rather than a NoMethodError on nil
    raise "Unknown segment: #{name} (path: #{path_name})" if segment.nil?

    orientation == '+' ? segment.sequence : segment.rc
  end
end
68
+
69
##
# Produce the contiguous path sequence (String) based on the segment
# sequences and orientations from a +gfa+ (which *must* be indexed).
# Each segment sequence is appended after removing the part that
# +after_overlap+ reports as already covered by the sequence built so far.
# Returns an empty String for a path without segments.
#
# TODO: Estimate gaps (Ns) from Jump distances (see +segment_sequences+)
#
# TODO: Attempt reading CIGAR values from the path first, the corresponding
#       links next, and actually performing the pairwise overlap as last resort
#
# TODO: Support ambiguous IUPAC codes for overlap evaluation
def sequence(gfa)
  # Grow a single mutable buffer: +after_overlap+ is evaluated against the
  # current contig first, and only then is its result appended, so the whole
  # contig is not copied again for every segment.
  segment_sequences(gfa).each_with_object(''.dup) do |field, contig|
    contig << after_overlap(contig, field.value)
  end
end
83
+
84
+ private
85
##
# Find the overlap between sequences +a+ and +b+ (Strings) and return
# only the part of +b+ after the overlap. Assumes that +a+ starts
# at the same point or before +b+. If no overlap is found, returns +b+
# in its entirety.
#
# Two kinds of overlap are recognized, in this order:
# - +b+ occurs in full anywhere within +a+: an empty String is returned.
# - A suffix of +a+ equals a prefix of +b+: the longest such suffix is
#   used, and the remainder of +b+ is returned.
def after_overlap(a, b)
  # Full containment: a single substring search replaces comparing +b+
  # against every offset of +a+
  return '' if a.include?(b)

  # Only suffixes of +a+ shorter than +b+ remain as candidates. Scanning
  # from the earliest offset tries the longest suffix first.
  first_from = [a.length - b.length + 1, 0].max
  (first_from...a.length).each do |a_from|
    tail = a[a_from..-1]
    return b[tail.length..-1] if b.start_with?(tail)
  end
  b
end
45
100
  end
@@ -32,4 +32,12 @@ class GFA::Record::Segment < GFA::Record
32
32
  def length
33
33
  sequence.value.length
34
34
  end
35
+
36
##
# Returns the reverse-complement of the sequence (as a Z field): a new
# GFA::Field::String built from the upcased sequence, read backwards, with
# every nucleotide code (including U and the ambiguous IUPAC codes) swapped
# for its complement. Characters outside the table are kept as they are.
def rc
  codes       = 'ACGTURYSWKMBDHVN'
  complements = 'TGCAAYRSWMKVHDBN'
  # Complementing is a per-character mapping, so it can be applied before
  # reversing with the same outcome
  flipped = sequence.value.upcase.tr(codes, complements).reverse
  GFA::Field::String.new(flipped)
end
35
43
  end
data/lib/gfa/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  class GFA
2
- VERSION = '0.6.0'
2
+ VERSION = '0.6.2'
3
3
  VERSION_ARRAY = VERSION.split(/\./).map { |x| x.to_i } # :nodoc:
4
4
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
5
5
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gfa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-10-05 00:00:00.000000000 Z
11
+ date: 2023-10-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rgl