rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,251 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/tmpfile'
3
-
4
-
5
- # Provides with a few helper functions to read and write files, as well # as
6
- # for accessing remote files. It supports caching the files.
7
- module Open
8
-
9
- # Return a Proc to use in the :select parameter of the Open.to_hash method.
10
- # It selects those lines with the content of the first field present on the
11
- # entities array. The field can be chosen to be a different one in the
12
- # options hash, also the separation string or regexp to determine fields.
13
- def self.func_match_field(entities, options = {})
14
- field, sep = {:field => 0, :sep => "\t"}.merge(options).values_at(:field, :sep)
15
-
16
- Proc.new {|line| entities.include? line.split(sep)[field] }
17
- end
18
-
19
- def self.fields(line, sep = "\t")
20
- line << sep
21
- line << "PLACEHOLDER"
22
- chunks = line.split(/(#{sep})/).select{|c| c !~ /^#{sep}$/ }
23
- if line =~ /#{sep}$/
24
- chunks << ""
25
- end
26
- chunks.pop
27
- chunks
28
- end
29
-
30
- class DirectoryNotFoundError < StandardError; end
31
- class OpenURLError < StandardError; end
32
-
33
- private
34
-
35
- @@remote_cachedir = File.join(Rbbt.cachedir, 'open-remote/')
36
- FileUtils.mkdir @@remote_cachedir unless File.exist? @@remote_cachedir
37
-
38
- # If no data is specified and the url is found in the cache the saved
39
- # contents are returned, if not found, the url is opened and the contents of
40
- # that are returned. If +data+ is specified then it is saved in the
41
- # cache under +url+. To match +url+ in the cache a MD5 digest is used.
42
- # The location of the cache directory is bu default
43
- # File.join(Rbbt.cachedir, 'open-remote/').
44
- def self.cache(url, data = nil)
45
- require 'digest/md5'
46
- digest = Digest::MD5.hexdigest(url)
47
-
48
- if data
49
- Open.write(File.join(@@remote_cachedir, digest), data)
50
- return nil
51
- else
52
- if File.exist? File.join(@@remote_cachedir, digest)
53
- return File.open(File.join(@@remote_cachedir, digest)){|file| file.read }
54
- else
55
- return nil
56
- end
57
- end
58
- end
59
-
60
- # Checks if +url+ is a remote file.
61
- def self.remote(url)
62
- url =~ /^(?:http|ssh|https|ftp):\/\//
63
- end
64
-
65
-
66
- # Checks if +url+ is a gzip file.
67
- def self.gziped(url)
68
- if remote(url)
69
- return url =~ /\.gz$/ || url =~ /\.gz\?.*$/
70
- else
71
- return url =~ /\.gz$/
72
- end
73
- end
74
-
75
-
76
- @@last_time = Time.now
77
- def self.wait(lag = 0)
78
- time = Time.now
79
-
80
- if time < @@last_time + lag
81
- sleep @@last_time + lag - time
82
- end
83
-
84
- @@last_time = Time.now
85
- end
86
-
87
- public
88
- # Reads the file specified by url. If the url es local it just opens
89
- # the file, if it is remote if checks the cache first. In any case, it
90
- # unzips gzip files automatically.
91
- #
92
- # Options:
93
- # * :quiet => Do not print the progress of downloads
94
- # * :nocache => do not use the cache.
95
- # * :nice => secconds to wait between online queries
96
- #
97
- def self.read(url, options = {})
98
-
99
- case
100
- when remote(url)
101
- if !options[:nocache] && data = cache(url)
102
- return data
103
- end
104
-
105
- wait(options[:nice]) if options[:nice]
106
- tmp = TmpFile.tmp_file("open-")
107
- `wget --user-agent=firefox -O #{tmp} '#{url}' #{options[:quiet] ? '-q' : '' }`
108
-
109
- if $?.success?
110
- if gziped(url)
111
- `mv #{tmp} #{tmp}.gz; gunzip #{tmp}`
112
- end
113
-
114
- cache(url, File.open(tmp){|file| file.read}) unless options[:nocache]
115
-
116
- data = File.open(tmp){|file| file.read}
117
- FileUtils.rm tmp
118
- return data
119
- else
120
- raise OpenURLError, "Error reading remote url: #{ url }"
121
- end
122
-
123
- when IO === url
124
- url.read
125
- else
126
- return File.open(url){|file| file.read}
127
- end
128
-
129
- end
130
-
131
- # Writes the contents on the path specified by filename
132
- #
133
- # Options:
134
- # * :force => Create directories if missing.
135
- def self.write(filename, content, options = {})
136
- if !File.exist? File.dirname(filename)
137
- if options[:force]
138
- FileUtils.makedirs(File.dirname(filename))
139
- else
140
- raise Open::DirectoryNotFoundError, "Directory #{File.dirname(filename)} was not found"
141
- end
142
- end
143
-
144
- File.open(filename,'w'){|f|
145
- f.write content
146
- }
147
-
148
- nil
149
- end
150
-
151
- # Writes the contents on the path specified by filename. If the file
152
- # is present it appends the contents.
153
- #
154
- # Options:
155
- # * :force => Create directories if missing.
156
- def self.append(filename, content, options ={})
157
- if !File.exist? File.dirname(filename)
158
- if options[:force]
159
- FileUtils.makedirs(File.dirname(filename))
160
- else
161
- raise Open::DirectoryNotFoundError, "Directory #{File.dirname(filename)} was not found"
162
- end
163
- end
164
-
165
- f = File.open(filename,'a')
166
- f.write content
167
- f.close
168
-
169
- nil
170
- end
171
-
172
-
173
-
174
- # Reads a file with rows with elementes separated by a given pattern
175
- # and builds a hash with it. The keys of the hash are the elements in
176
- # the :native positions, by default the first (0). The value for each
177
- # key is an array with one position for each of the rest possible
178
- # positions specified in :extra, by default all but the :native. Since
179
- # the native key may be repeated, each of the positions of the values
180
- # is in itself an array. There are a number of options to change this
181
- # behaviour.
182
- #
183
- # Options:
184
- # * :native => position of the elements that will constitute the keys. By default 0.
185
- # * :extra => positions of the rest of elements. By default all but :native. It can be an array of positions or a single position.
186
- # * :sep => pattern to use in splitting the lines into elements, by default "\t"
187
- # * :sep2 => pattern to use in splitting the elements into subelements, by default "|"
188
- # * :flatten => flatten the array of arrays that hold the values for each key into a simple array.
189
- # * :single => for each key select only the first of the values, instead of the complete array.
190
- # * :fix => A Proc that is called to pre-process the line
191
- # * :exclude => A Proc that is called to check if the line must be excluded from the process.
192
- # * :select => A Proc that is called to check if the line must be selected to process.
193
- def self.to_hash(input, options = {})
194
- native = options[:native] || 0
195
- extra = options[:extra]
196
- exclude = options[:exclude]
197
- select = options[:select]
198
- fix = options[:fix]
199
- sep = options[:sep] || "\t"
200
- sep2 = options[:sep2] || "|"
201
- single = options[:single]
202
- single = false if single.nil?
203
- flatten = options[:flatten]
204
- flatten = single if flatten.nil?
205
-
206
- extra = [extra] if extra && ! extra.is_a?( Array)
207
-
208
- if StringIO === input
209
- content = input
210
- else
211
- content = Open.read(input)
212
- end
213
-
214
- data = {}
215
- content.each_line{|l|
216
- l = fix.call(l) if fix
217
- next if exclude and exclude.call(l)
218
- next if select and ! select.call(l)
219
-
220
- row_fields = self.fields(l.chomp, sep)
221
- id = row_fields[native]
222
- next if id.nil? || id == ""
223
-
224
- data[id] ||= []
225
-
226
- if extra
227
- row_fields = row_fields.values_at(*extra)
228
- else
229
- row_fields.delete_at(native)
230
- end
231
-
232
-
233
- if flatten
234
- data[id] += row_fields.compact.collect{|v|
235
- v.split(sep2)
236
- }.flatten
237
- else
238
- row_fields.each_with_index{|value, i|
239
- next if value.nil?
240
- data[id][i] ||= []
241
- data[id][i] += value.split(sep2)
242
- }
243
- end
244
- }
245
-
246
- data = Hash[*(data.collect{|key,values| [key, values.first]}).flatten] if single
247
-
248
- data
249
- end
250
-
251
- end
@@ -1,183 +0,0 @@
1
- require 'rake'
2
-
3
- # Include the step_def and step methods to simplify Pipelines. Steps depend on
4
- # the step strictly above by default. The output of the step is save marshaled,
5
- # except for Strings which are save as text. The input of the step, the output
6
- # of the previous step if availabe is accessed with the input method
7
- #
8
- # Example::
9
- #
10
- # step :text do
11
- # "Text to revert"
12
- # end
13
- #
14
- # step :revert do
15
- # text = input
16
- # text.reverse
17
- # end
18
- #
19
- module Rake::Pipeline
20
-
21
- module Rake::Pipeline::Step
22
-
23
- class << self
24
-
25
- @@step_descriptions = {}
26
- def step_descriptions
27
- @@step_descriptions
28
- end
29
-
30
- def add_description(re, step, message)
31
- @@step_descriptions[re] = "#{ step }: #{ message }"
32
- end
33
-
34
- @@last_step = nil
35
- def step_def(name, dependencies = nil)
36
-
37
- re = Regexp.new(/(?:^|\/)#{name}\/.*$/)
38
-
39
- # Take the last_description and associate it with the name
40
- if Rake.application.last_description
41
- add_description(re, name, Rake.application.last_description)
42
- end
43
-
44
- if dependencies.nil? && ! @@last_step.nil?
45
- dependencies = @@last_step
46
- end
47
- @@last_step = name
48
-
49
- # Generate the Hash definition
50
- case
51
- when dependencies.nil?
52
- re
53
- when String === dependencies || Symbol === dependencies
54
- {re => lambda{|filename| filename.sub(name.to_s,dependencies.to_s) }}
55
- when Array === dependencies
56
- {re => lambda{|filename| dependencies.collect{|dep| filename.sub(name.to_s, dep.to_s) } }}
57
- when Proc === dependencies
58
- {re => dependencies}
59
- end
60
-
61
- end
62
-
63
- end
64
- end
65
-
66
- module Rake::Pipeline::Info
67
-
68
- def self.info_file(filename)
69
- filename.sub(/^(.*?)(?:[^\/]*)\/([^\/]*)$/, '\1.info/\2.yaml')
70
- end
71
-
72
- def self.load_info(t)
73
- filename = t.name
74
- info_filename = info_file(filename)
75
-
76
- if File.exists? info_filename
77
- YAML.load(File.open(info_filename))
78
- else
79
- {}
80
- end
81
- end
82
-
83
- def self.save_info(t, info = {})
84
- filename = t.name
85
- info_filename = info_file(filename)
86
-
87
- FileUtils.mkdir_p File.dirname(info_filename) unless File.exists? File.dirname(info_filename)
88
- File.open(info_filename,'w'){|file|
89
- file.write YAML.dump info
90
- }
91
- end
92
-
93
- end
94
-
95
-
96
- NON_ASCII_PRINTABLE = /[^\x20-\x7e\s]/
97
- def is_binary?(file)
98
- binary = file.read(1024) =~ NON_ASCII_PRINTABLE
99
- file.rewind
100
- binary
101
- end
102
-
103
- def step_descriptions
104
- Rake::Pipeline::Step.step_descriptions
105
- end
106
-
107
-
108
- def step_def(*args)
109
- Rake::Pipeline::Step.step_def(*args)
110
- end
111
-
112
- def infile(t, &block)
113
- File.open(t.prerequisites.first) do |f|
114
- block.call(f)
115
- end
116
- end
117
-
118
- def outfile(t, &block)
119
- File.open(t.name, 'w') do |f|
120
- block.call(f)
121
- end
122
- end
123
-
124
- def load_input(t)
125
- return nil if t.prerequisites.first.nil?
126
- infile(t){|f|
127
- if is_binary?(f)
128
- Marshal.load(f)
129
- else
130
- f.read
131
- end
132
- }
133
- end
134
-
135
- def save_output(t, output)
136
- case
137
- when output.nil?
138
- nil
139
- when String === output
140
- outfile(t){|f| f.write output }
141
- else
142
- outfile(t){|f| f.write Marshal.dump(output) }
143
- end
144
-
145
- end
146
-
147
- # We cannot load the input variable before the block.call, so we need another method
148
-
149
- # Load the input data from the previous step
150
- def input
151
- load_input(@@current_task) if @@current_task
152
- end
153
-
154
- # Add values to the info file
155
- def info(values = {})
156
- info = Rake::Pipeline::Info.load_info(@@current_task)
157
- info = info.merge values
158
- Rake::Pipeline::Info.save_info(@@current_task, info)
159
- info
160
- end
161
-
162
-
163
- # Define a new step, it depends on the previously defined by default. It
164
- # saves the output of the block so it can be loaded by the input method of
165
- # the next step
166
- def step(name, dependencies = nil, &block)
167
- rule step_def(name, dependencies) do |t|
168
-
169
- # Save the task object to be able to load the input
170
- @@current_task = t
171
-
172
- output = block.call(t)
173
-
174
- save_output(t, output)
175
- end
176
-
177
- end
178
- end
179
-
180
- if __FILE__ == $0
181
-
182
- p Rake::Pipeline::Info.info_file('work/diseases/t')
183
- end
@@ -1,87 +0,0 @@
1
- require 'parse_tree_extensions'
2
- require 'parse_tree'
3
- require 'ruby2ruby'
4
-
5
- # This class helps designing DSL in ruby based on method_missing. Class
6
- # is initialize with a block of code or a file with the code, and it is
7
- # given a method to be invoked instead of method missing. This class
8
- # deals simply with making the method_missing alias and removing it and
9
- # executing the block of file with code.
10
- class SimpleDSL
11
-
12
- class ConfigFileMissingError < StandardError; end
13
-
14
- private
15
-
16
- def hook_method(method = nil)
17
- method ||= :DSL_action
18
- @@restore_name = ("restore_DSL_" + method.to_s).to_sym
19
- @@method_name = method.to_sym
20
-
21
- class << self
22
- @restore_stack ||= []
23
- @restore_stack << @@restore_name
24
- alias_method(@@restore_name, :method_missing)
25
- alias_method(:method_missing, @@method_name)
26
- end
27
- end
28
-
29
- def unhook_method
30
- class << self
31
- alias_method(:method_missing, @restore_stack.pop)
32
- end
33
- end
34
-
35
- public
36
-
37
- def parse(method = nil, actions = nil, &block)
38
-
39
- actions ||= block
40
-
41
- hook_method(method)
42
-
43
- # Execute
44
- if actions.is_a? Proc
45
-
46
- @config[@@method_name] = actions.to_ruby.collect[1..-2].join
47
-
48
- instance_eval &actions
49
- elsif File.exists?(actions)
50
-
51
- @config[@@method_name] = File.open(actions).read
52
-
53
- eval File.open(actions).read
54
- end
55
-
56
- unhook_method
57
-
58
- end
59
-
60
-
61
- # Processes a DSL. +method+ is the name of the method executed instead
62
- # of method_missing. The code to be evaluated as a DSL is either
63
- # specified in +&block+ or in the file pointed by +file+.
64
- def initialize(method = nil, file = nil, &block)
65
- @config = {}
66
- if file
67
- raise ConfigFileMissingError.new "File '#{ file }' is missing. Have you installed the config files? (use rbbt_config)." unless File.exists? file
68
- parse(method, file)
69
- end
70
-
71
- if block
72
- parse(method, block)
73
- end
74
- end
75
-
76
- # Returns the code with the DSL that was executed. If it came from a
77
- # block it was turned to string using ruby2ruby.
78
- def config(action = nil)
79
- if action
80
- @config[action.to_sym]
81
- else
82
- @config[:DSL_action]
83
- end
84
- end
85
- end
86
-
87
-