gene-matcher 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 829cbe42a19bf6eb809358fe514714377cc7a8a856b1d2a21070760962c60d02
4
+ data.tar.gz: 04545fb06b196d235531d125575e246e268c7e526f9684db5fb37079b66b9844
5
+ SHA512:
6
+ metadata.gz: '0854fc5cac30e888a57dda0fbda709145fe206be6acea8a29493e793075e14633a33d5ee4fa57b6b84ea4e3baa660f7a869fc14a1d51a22fc930f16f377ed2ac'
7
+ data.tar.gz: 23e94a7632e246589b347ad9c87445780046b3d2bf1e44ae94691ea065fe22141a7d3cb12f3b1d05b855c9f2c76682577aeeef92bad2e6ccabad6b135ef00285
data/lib/alignment.rb ADDED
@@ -0,0 +1,62 @@
1
# Result of aligning two sequences: the aligned strings, their positions in
# the original sequences, a score and a few flags describing the match.
class Alignment
  attr_accessor :score, :alignmentI, :alignmentJ, :startI, :startJ, :endI, :endJ, :reversed, :aside, :source

  # Character used to mark a gap in an aligned sequence.
  BLANK = "-".freeze

  def initialize
    # Score of the alignment.
    @score = 0
    # Aligned target sequence (the one that was stored in the database).
    @alignmentI = ""
    # Aligned query sequence (the one that was entered by the user).
    @alignmentJ = ""
    # start*: position in the original sequence where the alignment begins.
    # end*:   position in the original sequence where the alignment ends.
    @startI = 0
    @startJ = 0
    # These must be instance variables; a bare `endI = 0` would only create
    # a local and leave the accessor returning nil.
    @endI = 0
    @endJ = 0
    # Whether the sequence was reversed front-to-back.
    @reversed = false
    # Whether the match was found on the opposite strand.
    @aside = false
    # Where the aligned sequence was taken from (for egtc: the clone table
    # or the accession table).
    @source = ""
  end

  # Builds a string marking matching and non-matching positions of the two
  # aligned sequences. Only the common prefix length of both is compared.
  #
  #   AGTCAAAAAAAAA-
  #   :...:::::::::.
  #   AT-TAAAAAAAAAG
  #
  # @return [String] ":" for every equal position, "." for every other one
  def alignment
    len = [@alignmentI.length, @alignmentJ.length].min
    Array.new(len) { |i| @alignmentI[i] == @alignmentJ[i] ? ":" : "." }.join
  end

  # @return [Integer] number of positions where both aligned sequences agree
  def alignment_count
    alignment.count(":")
  end

  # Not implemented: the method has no body and therefore returns nil.
  def number_of_blank
  end

  # @return [String] the score and both aligned sequences on one line
  def to_s
    "score=#{@score} I=#{@alignmentI} J=#{@alignmentJ}"
  end
end
@@ -0,0 +1,20 @@
1
+
2
+ require_relative 'smith-waterman'
3
+
4
# Collects the alignments of one query sequence against any number of target
# sequences, keeping only those whose score reaches a threshold.
class Matcher
  attr_accessor :input_sequence, :limit
  attr_reader :alignments

  # @param input_sequence the query sequence handed to SmithWaterman#alignment
  # @param limit [Numeric] minimum score an alignment needs to be kept
  def initialize(input_sequence, limit = 0.6)
    @limit = limit
    @input_sequence = input_sequence
    @alignments = []
  end

  # Aligns +target_sequence+ against the query and records the result when
  # its score is at least +limit+.
  #
  # @param target_sequence the sequence to search in
  # @return [Array, nil] the collected alignments when the new one was kept,
  #   nil when it was discarded
  def scan(target_sequence)
    found = SmithWaterman.instance.alignment(target_sequence, @input_sequence)
    # Append in place instead of building a new array for every hit.
    @alignments << found if found.score >= @limit
  end
end
20
+
@@ -0,0 +1,119 @@
1
+ # Calculate the similarity of two sequences.
2
+ # Originally created in Java language on 2004/08/04
3
+ # Migrated to Ruby on 2023/07/10
4
+ require_relative 'alignment'
5
+ require 'singleton'
6
+
7
# Local alignment of two sequences using a Smith-Waterman style score matrix
# (match = +1, mismatch = 0, gap = -1, floor = 0).
class SmithWaterman
  include Singleton

  # Alignments of this length or shorter always get a score of 0.
  UNSCORED_MAX_LENGTH = 20

  # Aligns +input+ against +target+ twice, once as given and once as its
  # reverse complement, and returns the better-scoring result. The
  # reverse-complement result has +aside+ set to true.
  #
  # @param target [String] sequence to search in
  # @param input [String] query sequence
  # @return [Alignment] the alignment with the higher score (the forward one
  #   on a tie)
  # @raise [RuntimeError] if either sequence is nil or empty
  def alignment(target, input)
    forward = alignment_local(target, input)
    opposite = alignment_local(target, aside_sequence(input))
    opposite.aside = true

    # TODO: set wholeLengthI and wholeLengthJ, then convert the coordinates:
    # a[1].startJ = a[1].wholeLengthJ -a[1].startJ -1;
    # a[1].endJ = a[1].wholeLengthJ -a[1].endJ-1;
    max([forward, opposite])
  end

  # Computes one local alignment of +input+ against +target+.
  #
  # The score is alignment_count squared divided by the aligned length, or 0
  # when the aligned length does not exceed UNSCORED_MAX_LENGTH.
  #
  # @param target [String] sequence to search in
  # @param input [String] query sequence
  # @return [Alignment]
  # @raise [RuntimeError] if either sequence is nil or empty
  def alignment_local(target, input)
    raise "input is nil or empty" if input.nil? || input.empty?
    raise "target is nil or empty" if target.nil? || target.empty?

    matrix, max_score, max_i, max_j = score_matrix(target, input)
    puts "maxScore=#{max_score} maxI=#{max_i} maxJ=#{max_j}" if ENV["DEBUG"]

    a = Alignment.new
    a.endI = max_i
    a.endJ = max_j
    a.alignmentI, a.alignmentJ, a.startI, a.startJ = trace_back(matrix, target, input, max_i, max_j)

    aligned_length = a.alignmentI.length
    a.score =
      if aligned_length <= UNSCORED_MAX_LENGTH
        0
      else
        a.alignment_count**2 / aligned_length.to_f
      end
    a
  end

  private

  # Fills the score matrix (rows = target positions, columns = input
  # positions) and remembers where the highest score was seen.
  #
  # @return [Array] matrix, highest score, and its row and column index
  def score_matrix(target, input)
    matrix = Array.new(target.length) { Array.new(input.length, 0) }
    max_score = 0
    max_i = 0
    max_j = 0
    target.length.times do |i|
      ci = target[i] # character of the target sequence
      row = matrix[i]
      input.length.times do |j|
        diagonal = s(ci, input[j])
        diagonal += matrix[i - 1][j - 1] if i > 0 && j > 0
        # Moves that would leave the matrix contribute 0.
        up = i > 0 ? matrix[i - 1][j] - 1 : 0
        left = j > 0 ? row[j - 1] - 1 : 0
        cell = [0, diagonal, up, left].max
        row[j] = cell
        # Remember the highest score; on equal scores the later cell wins.
        next if cell < max_score

        max_score = cell
        max_i = i
        max_j = j
      end
      puts "#{ci} #{row.join(' ')}" if ENV["DEBUG"]
    end
    [matrix, max_score, max_i, max_j]
  end

  # Walks back from cell (i, j), always stepping to the highest-valued of the
  # diagonal, upper and left neighbours (diagonal first on ties), until the
  # first row or column is reached or all three neighbours are 0.
  #
  # @return [Array] aligned target, aligned input, and the row and column
  #   index where the walk stopped
  def trace_back(matrix, target, input, i, j)
    # dup so that appending never touches the caller's objects.
    buf_i = target[i].dup
    buf_j = input[j].dup
    while i > 0 && j > 0
      neighbours = [matrix[i - 1][j - 1], matrix[i - 1][j], matrix[i][j - 1]]
      best = neighbours.max
      break if best == 0 # nowhere left to go

      case neighbours.index(best)
      when 0
        i -= 1
        j -= 1
        buf_i << target[i]
        buf_j << input[j]
      when 1
        i -= 1
        buf_i << target[i]
        buf_j << Alignment::BLANK
      when 2
        j -= 1
        buf_i << Alignment::BLANK
        buf_j << input[j]
      end
    end
    # The buffers were collected end-to-start.
    [buf_i.reverse, buf_j.reverse, i, j]
  end

  # Returns the alignment with the highest score; the earliest one on ties,
  # nil for an empty list.
  def max(alignments)
    alignments.reduce { |best, a| a.score > best.score ? a : best }
  end

  # Scoring function: 1 when both characters are equal, 0 otherwise.
  def s(a, b)
    a == b ? 1 : 0
  end

  # Reverse complement of +seq+ (A<->T, G<->C), for upper- and lower-case
  # bases; any other character is kept as is.
  def aside_sequence(seq)
    seq.reverse.tr("AGTCagtc", "TCAGtcag")
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gene-matcher
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - ITO Yosei
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-08-02 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Algorithm for determining similar regions between nucleic acid sequences.
14
+ email: y-itou@lumber-mill.co.jp
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/alignment.rb
20
+ - lib/gene-matcher.rb
21
+ - lib/smith-waterman.rb
22
+ homepage: https://github.com/lumbermill/gene-matcher
23
+ licenses:
24
+ - MIT
25
+ metadata: {}
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubygems_version: 3.3.26
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: Algorithm for determining similar regions between nucleic acid sequences.
45
+ test_files: []