gfa 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc98fa1d3479be3fee554dab64876d6cd42795aec08a6b60be6ccb148c57c679
4
- data.tar.gz: 28ef77ca42b374e58c983f474d44801060dadb2df912230e431cf40ee8f1386c
3
+ metadata.gz: 8069525ab289a4299c4d51df9a58a083df95292cd7bc59b83516ac4c5e7508f2
4
+ data.tar.gz: c1398a889d1f49431bfa01b196084ee9c448c60426fb179801151e7731538c03
5
5
  SHA512:
6
- metadata.gz: 1a7226dbe5813cc9e7b0cde0e756b0b41395d45e690fd3df0e6ad266f51a209d55d72fdaeaab5aaebd3fec7ce033b54bb04510903cc3eadef230bd30485f037e
7
- data.tar.gz: 32f081a9cf35219f05cb21654011a32a1645f13cbf1d7dda320ebef0b7d893523943cd71f8c9f8a310f1021b6adfaa99b1f863d498be1bbb77bf0a5e04779efd
6
+ metadata.gz: d97f574a6081727f5687e259a954881351bf7767ec36d1f5ac8b2e6eef939a59a9406077619b306d9782e99357ec2cb98536b4e73474a9aad75c61614c7f7d13
7
+ data.tar.gz: 385a463ad705401ec083375b917d3654633f44e3828fcc93546422fb80712fc4bc22dbc2dd128dbdf1db44a8c60ed4d7a31df558db8786eb1fca12225a29b2ab
data/bin/gfa-add-gaf CHANGED
@@ -18,11 +18,13 @@ unless degree
18
18
  <input-gaf> Input GAF file to read
19
19
  <output> Output GFA file to write
20
20
  <degree> Maximum degree of separation between the segment set in the GAF
21
- and any other included segments. If 0, only segments are
22
- included. If 1, only the target segments, records linking to
23
- them, and segments linked by those records. Any integer > 1
24
- includes additional expansion rounds for those linked segments.
25
- Use -1 to include the complete original GAF without subsetting.
21
+ and any other segments included in the output GFA.
22
+ - -1: include the complete original GAF without subsetting.
23
+ - 0: only segments in the GAF are included.
24
+ - 1: only the target segments in the GAF, records linking to
25
+ them, and segments linked by those records are included.
26
+ - Any integer > 1: include additional expansion rounds for
27
+ those linked segments.
26
28
  <pref> A prefix to name all recorded paths
27
29
  By default: Based on the GAF file name
28
30
  <threads> If passed, parallelize process with these many threads
@@ -26,4 +26,11 @@ end
26
26
  $stderr.puts "Loading GFA: #{input}"
27
27
  gfa = GFA.load_parallel(input, (threads || 1).to_i)
28
28
 
29
+ $stderr.puts "Saving path sequences: #{output}"
30
+ File.open(output, 'w') do |fasta|
31
+ gfa.paths.set.each do |path|
32
+ fasta.puts '>%s' % path.path_name.value
33
+ fasta.puts path.sequence(gfa)
34
+ end
35
+ end
29
36
 
data/lib/gfa/graph.rb CHANGED
@@ -146,7 +146,7 @@ class GFA
146
146
 
147
147
  def internally_linking_records(segments, edges)
148
148
  $stderr.puts '- Gathering internally linking records'
149
- segments = Hash[segments.set.map { |i| [i.name.value, true]}]
149
+ segments = Hash[segments.map { |i| [i.name.value, true]}]
150
150
  edges.select { |record| record.segment_names_a.all? { |s| segments[s] } }
151
151
  end
152
152
 
@@ -42,4 +42,59 @@ class GFA::Record::Path < GFA::Record
42
42
 
43
43
  segment_names_a.any? { |name| segment.name == name }
44
44
  end
45
+
##
# Array of GFA::Field::String with the sequences from each segment featuring
# the correct orientation from a +gfa+ (which *must* be indexed).
#
# Every comma- or semicolon-separated entry in +segment_names+ is read as a
# segment name followed by a one-character orientation: <tt>+</tt> takes the
# segment sequence as is, and <tt>-</tt> takes the result of calling +rc+ on
# the segment.
#
# Raises a RuntimeError if +gfa+ is not indexed, if an entry carries an
# orientation other than <tt>+</tt> or <tt>-</tt>, or if looking up the
# segment name in +gfa.segments+ comes back empty.
#
# TODO: Distinguish between a direct path (separated by comma) and a
# jump (separated by semicolon). Jumps include a distance estimate
# (column 6, optional) which could be used to add Ns between segment
# sequences (from GFA 1.2)
def segment_sequences(gfa)
  raise "Unindexed GFA" unless gfa.indexed?

  segment_names.value.split(/[,;]/).map do |entry|
    # Slice the entry instead of editing it in place, so that an empty
    # entry is reported as a bad orientation rather than an IndexError
    name = entry[0...-1]
    orientation = entry[-1]
    unless ['+', '-'].include?(orientation)
      raise "Unknown orientation: #{orientation} (path: #{path_name})"
    end

    # Report the offending name instead of failing later on a nil segment
    segment = gfa.segments[name]
    raise "Segment not found: #{name} (path: #{path_name})" unless segment

    orientation == '+' ? segment.sequence : segment.rc
  end
end
68
+
##
# Produce the contiguous path sequence based on the segment sequences and
# orientations from a +gfa+ (which *must* be indexed).
#
# The values from +segment_sequences+ are appended in order. Before each one
# is appended, +after_overlap+ is called with the sequence assembled so far
# and only the part it returns is added. Returns a String (empty when there
# are no segment sequences).
#
# TODO: Estimate gaps (Ns) from Jump distances (see +segment_sequences+)
#
# TODO: Attempt reading CIGAR values from the path first, the corresponding
# links next, and actually performing the pairwise overlap as last resort
#
# TODO: Support ambiguous IUPAC codes for overlap evaluation
def sequence(gfa)
  # Grow one buffer in place rather than building a new String per segment
  segment_sequences(gfa).each_with_object(''.dup) do |field, assembled|
    assembled << after_overlap(assembled, field.value)
  end
end
83
+
84
+ private
##
# Find the overlap between sequences +a+ and +b+ (Strings) and return
# only the part of +b+ after the overlap. Assumes that +a+ starts
# at the same point or before +b+. If no overlap is found, returns +b+
# in its entirety.
#
# Offsets into +a+ are tried from left to right, and the first offset from
# which +a+ and +b+ agree over their whole shared span is taken as the
# overlap. A +b+ that is fully contained in +a+ therefore yields an empty
# String. The comparison is exact (String equality).
def after_overlap(a, b)
  a.length.times do |offset|
    # Number of characters both sequences share when +b+ starts at +offset+
    shared = [b.length, a.length - offset].min
    return b[shared..-1].to_s if a[offset, shared] == b[0, shared]
  end
  b
end
45
100
  end
@@ -32,4 +32,12 @@ class GFA::Record::Segment < GFA::Record
32
32
  def length
33
33
  sequence.value.length
34
34
  end
35
+
##
# Returns the reverse-complement of the sequence (as a Z field)
#
# The sequence value is upcased and reversed, and then each character is
# complemented, including the ambiguity codes listed in the translation
# table (+U+ is complemented to +A+). Characters that are not in the table
# are kept as they are. The result is wrapped in a new GFA::Field::String.
def rc
  reversed = sequence.value.upcase.reverse
  GFA::Field::String.new(
    reversed.tr('ACGTURYSWKMBDHVN', 'TGCAAYRSWMKVHDBN')
  )
end
35
43
  end
data/lib/gfa/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  class GFA
2
- VERSION = '0.6.0'
2
+ VERSION = '0.6.2'
3
3
  VERSION_ARRAY = VERSION.split(/\./).map { |x| x.to_i } # :nodoc:
4
4
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
5
5
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gfa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-10-05 00:00:00.000000000 Z
11
+ date: 2023-10-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rgl