bio-repeatmasker 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/bio-repeatmasker.rb +75 -0
- data/test/chr1.fa.out_head100 +100 -0
- data/test/test_bio-repeatmasker.rb +9 -0
- metadata +50 -0
@@ -0,0 +1,75 @@
|
|
1
|
+
# Copyright:: Copyright (C) 2009
|
2
|
+
# Andrei Rozanski <rozanski.andrei@gmail.com>
|
3
|
+
# License:: The Ruby License
|
4
|
+
# == Description
|
5
|
+
# This file contains a parser for RepeatMasker output files.
|
6
|
+
|
7
|
+
#Define module RptMskr
|
8
|
+
module RptMskr

  # Parser and search helpers for raw RepeatMasker ".out" files, e.g. the
  # per-chromosome files shipped in
  # http://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/chromOut.tar.gz
  #
  # All state lives in the global $rpt_data, which is (re)built by Parser.open
  # and read by the search_* methods. Parser.open must therefore be called
  # before any search method.
  class Parser

    # Minimum number of whitespace-separated columns a line needs so that
    # every column read while building a record is present. The ID column
    # (index 14) is optional; when absent the record is stored under :"".
    MIN_FIELDS = 14

    def initialize;end

    # Parses a raw RepeatMasker output file.
    #
    # filename - path of the RepeatMasker output file.
    #
    # Lines whose first column contains no digit (the two header lines, blank
    # lines) are skipped, and so are truncated lines with fewer than
    # MIN_FIELDS columns.
    #
    # Returns (and stores in $rpt_data) a Hash whose keys are the ID column as
    # a Symbol and whose values are Arrays of record Hashes (one per line that
    # carries that ID). Every record value is a String.
    def self.open(filename)
      $rpt_data = Hash.new { |hash, key| hash[key] = [] }
      # File.foreach closes the file once iteration finishes.
      File.foreach(filename) do |line|
        # String#split without arguments ignores leading/trailing whitespace
        # and splits on runs of whitespace.
        fields = line.split
        next unless fields[0] =~ /[0-9]+/
        next if fields.size < MIN_FIELDS
        $rpt_data[fields[14].to_s.to_sym] << build_record(fields)
      end
      #return hash of arrays of hashes - key for main hash = id
      $rpt_data
    end

    # Builds one record Hash from the split columns of a data line.
    #
    # For complement-strand lines (strand column matches /C/) the first and
    # third "position in repeat" columns trade places: the parenthesised
    # value comes first and the begin position last.
    def self.build_record(fields)
      if fields[8] =~ /C/
        start_match = fields[13]
        n_prior     = fields[11]
      else
        start_match = fields[11]
        n_prior     = fields[13]
      end
      {
        "repeat_name"      => fields[9].to_s,
        "class_name"       => fields[10].to_s,
        "chromosome"       => fields[4].to_s,
        "start_coordinate" => fields[5].to_s,
        "end_coordinate"   => fields[6].to_s,
        "strand"           => fields[8].to_s,
        # The parentheses RepeatMasker puts around these counts are removed.
        "n_beyond_match"   => fields[7].to_s.delete("()"),
        "start_match"      => start_match.to_s,
        "end_match"        => fields[12].to_s,
        "n_prior_match"    => n_prior.to_s.delete("()"),
        "sw_score"         => fields[0].to_s,
        "subs_percent"     => fields[1].to_s,
        "del_percent"      => fields[2].to_s,
        "ins_percent"      => fields[3].to_s
      }
    end
    private_class_method :build_record

    # Prints every event whose id is listed in ids, as "<id>,<records>".
    #
    # ids - Array of ids (compared through to_s, so Strings, Symbols and
    #       Integers all work).
    #
    # Returns ids unchanged.
    def self.search_by_id(ids)
      ids.each do |id|
        $rpt_data.each do |key, value|
          puts "#{key},#{value}" if key.to_s == id.to_s
        end
      end
    end

    # Searches events by one field and a list of accepted values, e.g.
    # ("chromosome", ["chr1","chr2","chr3","chr4"]).
    #
    # field           - one of: id, repeat_name, class_name, chromosome,
    #                   start_coordinate, end_coordinate, strand,
    #                   n_beyond_match, start_match, end_match, n_prior_match,
    #                   sw_score, subs_percent, del_percent, ins_percent
    #                   (names follow the RepeatMasker output documentation,
    #                   http://www.repeatmasker.org/).
    # array_of_values - Array of values; each is compared through to_s.
    #
    # Returns an Array of unique "<id>,<records>" Strings for every id that
    # has at least one record matching any of the values. Results are grouped
    # by search value, in the order the values were given.
    def self.search_by_field(field, array_of_values)
      field_name = field.to_s
      # "id" is the key of the main hash, not a key of the record hashes.
      by_id = field_name == "id"
      result = []
      array_of_values.each do |search_val|
        wanted = search_val.to_s
        $rpt_data.each do |key, value|
          hit = if by_id
                  key.to_s == wanted
                else
                  value.any? { |record| record[field_name] == wanted }
                end
          result.push("#{key},#{value}") if hit
        end
      end
      # Returned only after every search value has been processed.
      result.uniq
    end

    # Searches events located inside a coordinate interval, e.g.
    # ("chr1", 10000, 20000).
    #
    # chr     - chromosome name as it appears in the file.
    # start_c - interval start (Integer or numeric String).
    # end_c   - interval end (Integer or numeric String).
    #
    # Coordinates are compared as integers. A record matches when it lies
    # entirely within [start_c, end_c], starts before end_c and ends after
    # start_c.
    #
    # Returns an Array of unique "<id>,<records>" Strings.
    def self.search_by_coordinate_interval(chr, start_c, end_c)
      chrom = chr.to_s
      lower = start_c.to_s.to_i
      upper = end_c.to_s.to_i
      result = []
      $rpt_data.each do |key, value|
        inside = value.any? do |record|
          next false unless record["chromosome"] == chrom
          first = record["start_coordinate"].to_i
          last  = record["end_coordinate"].to_i
          first >= lower && last <= upper && first < upper && last > lower
        end
        result.push("#{key},#{value}") if inside
      end
      result.uniq
    end
  end #Parser
end #RptMskr
|
@@ -0,0 +1,100 @@
|
|
1
|
+
SW perc perc perc query position in query matching repeat position in repeat
|
2
|
+
score div. del. ins. sequence begin end (left) repeat class/family begin end (left) ID
|
3
|
+
|
4
|
+
1504 1.3 0.4 1.3 chr1 10001 10468 (249240153) + (CCCTAA)n Simple_repeat 1 463 (0) 1
|
5
|
+
3612 11.4 27.0 1.3 chr1 10469 11447 (249239174) C TAR1 Satellite/telo (399) 1712 483 2
|
6
|
+
437 23.5 18.6 3.5 chr1 11504 11675 (249238946) C L1MC LINE/L1 (2236) 5646 5449 3
|
7
|
+
239 29.4 1.9 1.0 chr1 11678 11780 (249238841) C MER5B DNA/hAT-Charlie (74) 104 1 4
|
8
|
+
318 23.0 3.8 0.0 chr1 15265 15355 (249235266) C MIR3 SINE/MIR (119) 143 49 5
|
9
|
+
203 16.2 0.0 0.0 chr1 16713 16749 (249233872) + (TGG)n Simple_repeat 1 37 (0) 6
|
10
|
+
239 33.8 14.8 0.0 chr1 18907 19048 (249231573) + L2a LINE/L2 2942 3104 (322) 7
|
11
|
+
652 34.6 8.5 4.2 chr1 19948 20405 (249230216) + L3 LINE/CR1 3042 3519 (970) 8
|
12
|
+
270 33.1 0.7 2.7 chr1 20531 20679 (249229942) + Plat_L3 LINE/CR1 2802 2947 (639) 9
|
13
|
+
254 27.9 4.7 3.9 chr1 21949 22075 (249228546) + MLT1K LTR/ERVL-MaLR 15 142 (453) 10
|
14
|
+
787 28.2 3.9 0.3 chr1 23120 23371 (249227250) C MIR SINE/MIR (6) 262 4 11
|
15
|
+
312 28.4 20.4 4.3 chr1 23804 24038 (249226583) + L2b LINE/L2 2940 3212 (175) 12
|
16
|
+
413 24.2 31.3 1.2 chr1 24088 24250 (249226371) + MIR SINE/MIR 49 260 (2) 13
|
17
|
+
299 30.6 2.1 12.4 chr1 24255 24448 (249226173) + L2b LINE/L2 3213 3425 (1) 12
|
18
|
+
188 18.1 7.1 3.7 chr1 26356 26412 (249224209) + MIR SINE/MIR 102 159 (109) 14
|
19
|
+
241 30.3 22.2 3.1 chr1 26583 26790 (249223831) C L2c LINE/L2 (14) 3373 3116 15
|
20
|
+
2070 9.5 0.4 0.0 chr1 26791 27053 (249223568) + AluSp SINE/Alu 1 264 (49) 16
|
21
|
+
241 30.3 22.2 3.1 chr1 27054 27137 (249223484) C L2c LINE/L2 (272) 3115 3026 15
|
22
|
+
1300 14.6 6.9 0.1 chr1 27269 27518 (249223103) + MER33 DNA/hAT-Charlie 1 254 (70) 17
|
23
|
+
275 33.6 9.2 2.0 chr1 27833 28014 (249222607) C MIRb SINE/MIR (70) 198 1 18
|
24
|
+
304 32.0 5.9 3.3 chr1 28151 28302 (249222319) C MIR SINE/MIR (8) 254 99 19
|
25
|
+
1323 19.9 2.7 2.0 chr1 29902 30198 (249220423) + L1MB3 LINE/L1 5883 6181 (2) 20
|
26
|
+
979 14.4 3.2 1.6 chr1 30343 30532 (249220089) C MER53 DNA/hAT (0) 193 1 21
|
27
|
+
741 18.7 4.5 0.0 chr1 30694 30848 (249219773) + MLT1A LTR/ERVL-MaLR 1 162 (212) 22
|
28
|
+
444 11.6 0.0 3.1 chr1 30855 30952 (249219669) + (TC)n Simple_repeat 2 96 (0) 23
|
29
|
+
741 24.9 0.0 1.1 chr1 30953 31131 (249219490) + MLT1A LTR/ERVL-MaLR 173 349 (25) 22
|
30
|
+
407 28.8 7.3 3.0 chr1 31293 31435 (249219186) + MIRc SINE/MIR 67 217 (51) 24
|
31
|
+
2059 14.1 0.3 0.3 chr1 31436 31733 (249218888) + AluJo SINE/Alu 1 298 (14) 25
|
32
|
+
407 28.8 7.3 3.0 chr1 31734 31754 (249218867) + MIRc SINE/MIR 218 238 (30) 24
|
33
|
+
656 23.6 12.2 3.0 chr1 32841 33037 (249217584) + MIR SINE/MIR 2 216 (46) 26
|
34
|
+
2058 18.3 4.9 0.0 chr1 33048 33456 (249217165) + L1MB5 LINE/L1 5746 6174 (0) 27
|
35
|
+
257 25.0 0.0 0.0 chr1 33466 33509 (249217112) + AluYc SINE/Alu 1 44 (255) 28
|
36
|
+
4051 7.0 0.4 0.0 chr1 33529 34041 (249216580) C L1PA6 LINE/L1 (0) 6154 5640 29
|
37
|
+
456 8.2 0.0 0.0 chr1 34048 34108 (249216513) + L1P1 LINE/L1 2970 3030 (3116) 30
|
38
|
+
351 23.6 1.8 3.6 chr1 34451 34560 (249216061) C L2c LINE/L2 (231) 3156 3049 31
|
39
|
+
850 24.9 15.1 0.8 chr1 34565 34921 (249215700) C MLT1J2 LTR/ERVL-MaLR (0) 448 41 32
|
40
|
+
335 29.5 12.7 2.7 chr1 35217 35366 (249215255) C MIRb SINE/MIR (13) 255 91 33
|
41
|
+
1000 13.5 0.0 0.0 chr1 35367 35499 (249215122) + AluJr SINE/Alu 1 133 (179) 34
|
42
|
+
1566 19.4 3.6 2.8 chr1 37045 37431 (249213190) + Charlie5 DNA/hAT-Charlie 2234 2623 (1) 35
|
43
|
+
236 34.8 3.0 0.4 chr1 37733 37861 (249212760) + L2c LINE/L2 3242 3375 (0) 36
|
44
|
+
588 17.4 7.5 0.8 chr1 38059 38191 (249212430) + L2a LINE/L2 2705 2846 (573) 37
|
45
|
+
3877 22.9 14.0 1.6 chr1 38256 39464 (249211157) + MLT1E1A-int LTR/ERVL-MaLR 131 1489 (0) 38
|
46
|
+
750 27.6 9.4 6.4 chr1 39465 39623 (249210998) + MLT1E1A LTR/ERVL-MaLR 1 172 (388) 38
|
47
|
+
2292 12.0 0.0 0.0 chr1 39624 39924 (249210697) + AluSx SINE/Alu 1 301 (11) 39
|
48
|
+
783 26.6 14.1 3.8 chr1 39925 40294 (249210327) + MLT1E1A LTR/ERVL-MaLR 173 666 (14) 38
|
49
|
+
747 30.4 4.1 3.7 chr1 40333 40626 (249209995) + L2a LINE/L2 2923 3217 (209) 37
|
50
|
+
666 12.9 4.0 0.0 chr1 40629 40729 (249209892) C AluSz6 SINE/Alu (0) 312 208 40
|
51
|
+
260 38.5 1.4 0.0 chr1 40736 40878 (249209743) C LTR16C LTR/ERVL (113) 376 232 41
|
52
|
+
1118 35.4 9.4 2.6 chr1 41380 42285 (249208336) C ERVL-E-int LTR/ERVL (685) 4982 4016 42
|
53
|
+
341 27.8 5.9 1.5 chr1 42370 42504 (249208117) + MamRep1527 LTR 359 499 (470) 43
|
54
|
+
7010 19.2 2.7 2.5 chr1 43243 44835 (249205786) + L1MA8 LINE/L1 3715 5305 (838) 44
|
55
|
+
320 0.0 0.0 2.4 chr1 44836 44876 (249205745) + (TAAA)n Simple_repeat 2 41 (0) 45
|
56
|
+
7010 17.3 9.3 1.4 chr1 44877 45753 (249204868) + L1MA8 LINE/L1 5306 6290 (1) 44
|
57
|
+
249 21.2 1.0 2.0 chr1 45887 45987 (249204634) + L1M5 LINE/L1 5334 5433 (691) 46
|
58
|
+
458 19.3 0.0 5.0 chr1 46079 46198 (249204423) + L1MA9 LINE/L1 6167 6280 (32) 47
|
59
|
+
24 0.0 0.0 0.0 chr1 46217 46240 (249204381) + AT_rich Low_complexity 1 24 (0) 48
|
60
|
+
631 3.9 0.0 0.0 chr1 46416 46493 (249204128) C LTR12F LTR/ERV1 (115) 404 327 49
|
61
|
+
631 26.6 1.8 0.6 chr1 46553 46722 (249203899) + MER45A DNA/hAT-Tip100 7 178 (0) 50
|
62
|
+
875 22.6 9.5 0.5 chr1 46893 47092 (249203529) C MER58A DNA/hAT-Charlie (4) 220 3 51
|
63
|
+
5956 9.6 2.1 0.6 chr1 48417 49518 (249201103) + L1PA14 LINE/L1 5047 6161 (0) 52
|
64
|
+
29 0.0 0.0 0.0 chr1 50129 50157 (249200464) + AT_rich Low_complexity 1 29 (0) 53
|
65
|
+
288 0.0 0.0 0.0 chr1 50482 50513 (249200108) + (TG)n Simple_repeat 2 33 (0) 54
|
66
|
+
217 12.7 3.6 1.8 chr1 50571 50626 (249199995) + T-rich Low_complexity 4 60 (0) 55
|
67
|
+
2368 7.4 0.3 0.0 chr1 51585 51880 (249198741) + AluY SINE/Alu 1 297 (14) 56
|
68
|
+
21 0.0 0.0 0.0 chr1 52146 52166 (249198455) + AT_rich Low_complexity 1 21 (0) 57
|
69
|
+
25 3.1 0.0 0.0 chr1 52209 52240 (249198381) + AT_rich Low_complexity 1 32 (0) 58
|
70
|
+
29 2.8 0.0 0.0 chr1 53745 53780 (249196841) + AT_rich Low_complexity 1 36 (0) 59
|
71
|
+
830 35.1 11.2 3.4 chr1 54465 54712 (249195909) + L2 LINE/L2 2090 2350 (1069) 60
|
72
|
+
761 5.7 0.0 1.9 chr1 54713 54820 (249195801) + (TTTC)n Simple_repeat 3 108 (0) 61
|
73
|
+
670 23.3 0.0 0.0 chr1 54822 54937 (249195684) C FLAM_A SINE/Alu (26) 116 1 62
|
74
|
+
830 35.1 11.2 3.4 chr1 54939 55608 (249195013) + L2 LINE/L2 2351 3082 (337) 60
|
75
|
+
441 24.0 8.5 4.6 chr1 59056 59208 (249191413) C L2a LINE/L2 (5) 3421 3263 63
|
76
|
+
1529 17.2 18.2 0.0 chr1 60379 60686 (249189935) C MER47A DNA/TcMar-Tigger (0) 366 3 64
|
77
|
+
426 30.7 7.2 3.8 chr1 61696 61862 (249188759) C L1M5 LINE/L1 (475) 5649 5481 65
|
78
|
+
2412 8.1 0.0 0.3 chr1 61863 62160 (249188461) C AluSc SINE/Alu (12) 297 1 66
|
79
|
+
426 30.7 7.2 3.8 chr1 62161 62229 (249188392) C L1M5 LINE/L1 (644) 5480 5406 65
|
80
|
+
204 4.0 0.0 0.0 chr1 62232 62256 (249188365) + (CA)n Simple_repeat 2 26 (0) 67
|
81
|
+
1994 3.7 0.0 0.0 chr1 64426 64666 (249185955) + L1PA4 LINE/L1 5911 6151 (4) 68
|
82
|
+
306 35.2 2.5 0.8 chr1 65977 66134 (249184487) C MIR3 SINE/MIR (9) 199 45 69
|
83
|
+
391 3.9 7.6 1.3 chr1 66158 66236 (249184385) + (TA)n Simple_repeat 2 85 (0) 70
|
84
|
+
624 7.0 2.8 5.0 chr1 66237 66415 (249184206) + (TATAA)n Simple_repeat 5 179 (0) 71
|
85
|
+
514 8.8 5.6 1.9 chr1 66379 66540 (249184081) + (TTATA)n Simple_repeat 4 171 (0) 72
|
86
|
+
398 6.7 8.9 0.0 chr1 66541 66630 (249183991) + (TA)n Simple_repeat 1 98 (0) 73
|
87
|
+
539 33.9 7.3 1.2 chr1 67999 68246 (249182375) C MIRb SINE/MIR (0) 268 6 74
|
88
|
+
223 29.2 1.2 5.3 chr1 68507 68676 (249181945) + T-rich Low_complexity 4 166 (0) 75
|
89
|
+
26 0.0 0.0 0.0 chr1 70342 70367 (249180254) + AT_rich Low_complexity 1 26 (0) 76
|
90
|
+
350 31.0 8.7 0.0 chr1 70504 70712 (249179909) C LTR89 LTR/ERVL? (259) 620 336 77
|
91
|
+
557 29.9 6.0 2.6 chr1 70723 70955 (249179666) + MIRb SINE/MIR 26 266 (2) 78
|
92
|
+
398 34.8 5.4 4.7 chr1 71369 71645 (249178976) + L1MC4a LINE/L1 4810 5088 (1058) 79
|
93
|
+
232 28.1 10.1 0.0 chr1 71741 71829 (249178792) + L1MC4a LINE/L1 5254 5351 (2531) 79
|
94
|
+
381 18.7 0.0 0.0 chr1 72090 72164 (249178457) + (TA)n Simple_repeat 2 76 (0) 80
|
95
|
+
2634 7.0 1.5 0.0 chr1 72185 72525 (249178096) C L1PA7 LINE/L1 (4) 6150 5805 81
|
96
|
+
2919 6.2 1.1 0.0 chr1 72526 72897 (249177724) + L1PA7 LINE/L1 5415 5790 (364) 81
|
97
|
+
3753 17.5 0.6 0.3 chr1 73009 73678 (249176943) C L1PA16 LINE/L1 (4) 6162 5491 82
|
98
|
+
425 24.2 3.2 4.5 chr1 73688 73843 (249176778) + L1MC4a LINE/L1 5372 5525 (2357) 79
|
99
|
+
7846 1.6 0.0 0.0 chr1 73845 74895 (249175726) C L1PA2 LINE/L1 (0) 6155 5105 83
|
100
|
+
15365 2.1 0.0 0.0 chr1 74898 76697 (249173924) + L1PA2 LINE/L1 3311 5110 (1036) 83
|
@@ -0,0 +1,9 @@
|
|
1
|
+
#!/usr/bin/ruby

# Smoke test for RptMskr::Parser: parses the bundled fixture and prints every
# event whose repeat_name is "AT_rich".

require "bio-repeatmasker"

# Resolve the fixture next to this script so the test does not depend on the
# directory it is launched from.
fixture = File.join(File.dirname(__FILE__), "chr1.fa.out_head100")
RptMskr::Parser.open(fixture)

# Other calls that can be tried against the same fixture:
#IDs=RptMskr::Parser.search_by_id(["2","38"])
#fields=RptMskr::Parser.search_by_field("repeat_name",["AT_rich"])
#coord_interval=RptMskr::Parser.search_by_coordinate_interval("chr1","73845","76697")
puts RptMskr::Parser.search_by_field("repeat_name",["AT_rich"])
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bio-repeatmasker
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Andrei Rozanski
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-01-11 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: A simple gem to parse, filter/search raw RepeatMasker output file from
|
15
|
+
GoldenPath
|
16
|
+
email: andrei@ruivo.org
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- lib/bio-repeatmasker.rb
|
22
|
+
- test/test_bio-repeatmasker.rb
|
23
|
+
- test/chr1.fa.out_head100
|
24
|
+
homepage: http://rubygems.org/gems/bio-repeatmasker
|
25
|
+
licenses: []
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ! '>='
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubyforge_project:
|
44
|
+
rubygems_version: 1.8.23
|
45
|
+
signing_key:
|
46
|
+
specification_version: 3
|
47
|
+
summary: bio-repeatmasker
|
48
|
+
test_files:
|
49
|
+
- test/test_bio-repeatmasker.rb
|
50
|
+
- test/chr1.fa.out_head100
|