rbbt-text 1.1.9 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76):
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +56 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +61 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +42 -12
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -361
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -355
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -52
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,14 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+
4
+ class TestDocument < Test::Unit::TestCase
5
+
6
+ def test_docid
7
+ text = "This is a document"
8
+ Document.setup(text, "TEST", "test_doc1", nil)
9
+
10
+ assert_equal ["TEST", "test_doc1", nil, Misc.digest(text)] * ":", text.docid
11
+ end
12
+
13
+ end
14
+
@@ -0,0 +1,182 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+ require 'rbbt/segment'
5
+
6
+ class TestSegment < Test::Unit::TestCase
7
+ def test_segment
8
+ text = "This is a document"
9
+ Document.setup(text, "TEST", "test_doc1", nil)
10
+
11
+ segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
12
+
13
+ assert_equal text.docid + ":" + segment.offset.to_s + ".." + segment.eend.to_s, segment.segid
14
+ end
15
+
16
+ def test_segid
17
+ text = "This is a document"
18
+ Document.setup(text, "TEST", "test_doc1", nil)
19
+
20
+ corpus = Document::Corpus.setup({})
21
+
22
+ corpus.add_document(text)
23
+
24
+ segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
25
+
26
+ segid = segment.segid(corpus)
27
+
28
+ segment = segid.segment
29
+ assert_equal "is", segment
30
+ end
31
+
32
+ def test_info
33
+ segment = "test"
34
+ segment.extend Segment
35
+ segment.offset = 10
36
+ assert segment.info.include? :offset
37
+ end
38
+
39
+ def test_sort
40
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
41
+ Document.setup(text, "TEST", "test_doc1", nil)
42
+
43
+ corpus = Document::Corpus.setup({})
44
+
45
+ corpus.add_document(text)
46
+
47
+ gene1 = "TP53"
48
+ gene1.extend Segment
49
+ gene1.offset = text.index gene1
50
+ gene1.docid = text.docid
51
+
52
+ gene2 = "CDK5R1"
53
+ gene2.extend Segment
54
+ gene2.offset = text.index gene2
55
+ gene2.docid = text.docid
56
+
57
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
58
+
59
+ assert_equal [gene1,gene2], Segment.sort([gene2.segid(corpus),gene1.segid(corpus)]).collect{|segid| segid.segment}
60
+ end
61
+
62
+ def test_clean_sort
63
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
64
+ Document.setup(text, "TEST", "test_doc1", nil)
65
+
66
+ corpus = Document::Corpus.setup({})
67
+
68
+ corpus.add_document(text)
69
+
70
+ gene1 = "TP53"
71
+ gene1.extend Segment
72
+ gene1.offset = text.index gene1
73
+ gene1.docid = text.docid
74
+
75
+ gene2 = "CDK5R1"
76
+ gene2.extend Segment
77
+ gene2.offset = text.index gene2
78
+ gene2.docid = text.docid
79
+
80
+ gene3 = "TP53 gene"
81
+ gene3.extend Segment
82
+ gene3.offset = text.index gene1
83
+ gene3.docid = text.docid
84
+
85
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
86
+
87
+ assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
88
+ end
89
+
90
+ def test_split
91
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
92
+ Document.setup(text, "TEST", "test_doc1", nil)
93
+
94
+ corpus = Document::Corpus.setup({})
95
+
96
+ corpus.add_document(text)
97
+
98
+ gene1 = "TP53"
99
+ gene1.extend Segment
100
+ gene1.offset = text.index gene1
101
+ gene1.docid = text.docid
102
+
103
+ gene2 = "CDK5R1"
104
+ gene2.extend Segment
105
+ gene2.offset = text.index gene2
106
+ gene2.docid = text.docid
107
+
108
+ gene3 = "TP53 gene"
109
+ gene3.extend Segment
110
+ gene3.offset = text.index gene1
111
+ gene3.docid = text.docid
112
+
113
+ assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3])
114
+
115
+ assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3].collect{|s| s.segid})
116
+ end
117
+
118
+
119
+ def test_align
120
+ text =<<-EOF
121
+ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
122
+ EOF
123
+
124
+ parts = text.split(/\W/)
125
+ Segment.align(text, parts)
126
+
127
+ assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
128
+
129
+ Document.setup(text, "TEST", "test_doc1", nil)
130
+
131
+ parts = text.split(/\W/)
132
+ Segment.align(text, parts)
133
+
134
+ assert_equal parts.first.docid, text.docid
135
+ end
136
+
137
+ def test_segment_index
138
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
139
+ Document.setup(text, "TEST", "test_doc1", nil)
140
+
141
+ corpus = Document::Corpus.setup({})
142
+
143
+ corpus.add_document(text)
144
+
145
+ gene1 = "TP53"
146
+ gene1.extend Segment
147
+ gene1.offset = text.index gene1
148
+ gene1.docid = text.docid
149
+
150
+ gene2 = "CDK5R1"
151
+ gene2.extend Segment
152
+ gene2.offset = text.index gene2
153
+ gene2.docid = text.docid
154
+
155
+ gene3 = "TP53 gene"
156
+ gene3.extend Segment
157
+ gene3.offset = text.index gene1
158
+ gene3.docid = text.docid
159
+
160
+ index = Segment.index([gene1, gene2, gene3], corpus)
161
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
162
+
163
+ TmpFile.with_file do |fwt|
164
+ index = Segment.index([gene1, gene2, gene3], corpus, fwt)
165
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
166
+ index = Segment.index([gene1, gene2, gene3], corpus, fwt)
167
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
168
+ end
169
+
170
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus)
171
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
172
+
173
+ TmpFile.with_file do |fwt|
174
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
175
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
176
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
177
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
178
+ end
179
+ end
180
+
181
+ end
182
+
@@ -6,7 +6,7 @@ require 'rbbt'
6
6
  require 'rbbt/persist'
7
7
  require 'rbbt/util/tmpfile'
8
8
  require 'rbbt/util/log'
9
- require 'rbbt/text/corpus'
9
+ #require 'rbbt/text/corpus'
10
10
 
11
11
  class Test::Unit::TestCase
12
12
  def get_test_datafile(file)
@@ -22,8 +22,10 @@ class Test::Unit::TestCase
22
22
  FileUtils.rm_rf Rbbt.tmp.test.find :user
23
23
  Persist::CONNECTIONS.values.each do |c| c.close end
24
24
  Persist::CONNECTIONS.clear
25
- Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
26
- Corpus::DocumentRepo::TC_CONNECTIONS.clear
25
+ if defined? Corpus
26
+ Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
27
+ Corpus::DocumentRepo::TC_CONNECTIONS.clear
28
+ end
27
29
  end
28
30
 
29
31
  end
@@ -0,0 +1,32 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
2
+ require 'rbbt/nlp/spaCy'
3
+ require 'rbbt/document/corpus'
4
+
5
+ class TestSpaCy < Test::Unit::TestCase
6
+ def _test_tokens
7
+ text = "I tell a story"
8
+
9
+ tokens = SpaCy.tokens(text)
10
+
11
+ assert_equal 4, tokens.length
12
+ assert_equal "tell", tokens[1].to_s
13
+ end
14
+
15
+ def test_segments
16
+ text = "I tell a story. It's a very good story."
17
+
18
+ corpus = Document::Corpus.setup({})
19
+
20
+ Document.setup(text, "TEST", "test_doc1", "simple_sentence")
21
+
22
+ corpus.add_document text
23
+ text.corpus = corpus
24
+
25
+ segments = SpaCy.segments(text)
26
+
27
+ segments.each do |segment|
28
+ assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
29
+ end
30
+ end
31
+ end
32
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.9
4
+ version: 1.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-13 00:00:00.000000000 Z
11
+ date: 2020-06-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -78,6 +78,10 @@ files:
78
78
  - lib/rbbt/bow/bow.rb
79
79
  - lib/rbbt/bow/dictionary.rb
80
80
  - lib/rbbt/bow/misc.rb
81
+ - lib/rbbt/document.rb
82
+ - lib/rbbt/document/annotation.rb
83
+ - lib/rbbt/document/corpus.rb
84
+ - lib/rbbt/document/corpus/pubmed.rb
81
85
  - lib/rbbt/ner/NER.rb
82
86
  - lib/rbbt/ner/abner.rb
83
87
  - lib/rbbt/ner/banner.rb
@@ -98,18 +102,18 @@ files:
98
102
  - lib/rbbt/nlp/genia/sentence_splitter.rb
99
103
  - lib/rbbt/nlp/nlp.rb
100
104
  - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
101
- - lib/rbbt/text/corpus.rb
102
- - lib/rbbt/text/corpus/document.rb
103
- - lib/rbbt/text/corpus/document_repo.rb
104
- - lib/rbbt/text/corpus/sources/pmid.rb
105
- - lib/rbbt/text/document.rb
106
- - lib/rbbt/text/segment.rb
107
- - lib/rbbt/text/segment/docid.rb
108
- - lib/rbbt/text/segment/named_entity.rb
109
- - lib/rbbt/text/segment/relationship.rb
110
- - lib/rbbt/text/segment/segmented.rb
111
- - lib/rbbt/text/segment/token.rb
112
- - lib/rbbt/text/segment/transformed.rb
105
+ - lib/rbbt/nlp/spaCy.rb
106
+ - lib/rbbt/segment.rb
107
+ - lib/rbbt/segment/annotation.rb
108
+ - lib/rbbt/segment/encoding.rb
109
+ - lib/rbbt/segment/named_entity.rb
110
+ - lib/rbbt/segment/overlaps.rb
111
+ - lib/rbbt/segment/range_index.rb
112
+ - lib/rbbt/segment/relationship.rb
113
+ - lib/rbbt/segment/segmented.rb
114
+ - lib/rbbt/segment/token.rb
115
+ - lib/rbbt/segment/transformed.rb
116
+ - lib/rbbt/segment/tsv.rb
113
117
  - share/install/software/ABNER
114
118
  - share/install/software/BANNER
115
119
  - share/install/software/ChemicalTagger
@@ -128,6 +132,9 @@ files:
128
132
  - test/rbbt/bow/test_bow.rb
129
133
  - test/rbbt/bow/test_dictionary.rb
130
134
  - test/rbbt/bow/test_misc.rb
135
+ - test/rbbt/document/corpus/test_pubmed.rb
136
+ - test/rbbt/document/test_annotation.rb
137
+ - test/rbbt/document/test_corpus.rb
131
138
  - test/rbbt/entity/test_document.rb
132
139
  - test/rbbt/ner/test_NER.rb
133
140
  - test/rbbt/ner/test_abner.rb
@@ -146,16 +153,17 @@ files:
146
153
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
147
154
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
148
155
  - test/rbbt/nlp/test_nlp.rb
149
- - test/rbbt/text/corpus/sources/test_pmid.rb
150
- - test/rbbt/text/corpus/test_document.rb
151
- - test/rbbt/text/segment/test_named_entity.rb
152
- - test/rbbt/text/segment/test_relationship.rb
153
- - test/rbbt/text/segment/test_segmented.rb
154
- - test/rbbt/text/segment/test_transformed.rb
155
- - test/rbbt/text/test_corpus.rb
156
- - test/rbbt/text/test_document.rb
157
- - test/rbbt/text/test_segment.rb
156
+ - test/rbbt/segment/test_annotation.rb
157
+ - test/rbbt/segment/test_corpus.rb
158
+ - test/rbbt/segment/test_encoding.rb
159
+ - test/rbbt/segment/test_named_entity.rb
160
+ - test/rbbt/segment/test_overlaps.rb
161
+ - test/rbbt/segment/test_range_index.rb
162
+ - test/rbbt/segment/test_transformed.rb
163
+ - test/rbbt/test_document.rb
164
+ - test/rbbt/test_segment.rb
158
165
  - test/test_helper.rb
166
+ - test/test_spaCy.rb
159
167
  homepage: http://github.com/mikisvaz/rbbt-util
160
168
  licenses: []
161
169
  metadata: {}
@@ -182,18 +190,13 @@ test_files:
182
190
  - test/rbbt/nlp/test_nlp.rb
183
191
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
184
192
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
185
- - test/rbbt/text/test_document.rb
186
- - test/rbbt/text/corpus/sources/test_pmid.rb
187
- - test/rbbt/text/corpus/test_document.rb
188
- - test/rbbt/text/test_segment.rb
189
- - test/rbbt/text/test_corpus.rb
190
- - test/rbbt/text/segment/test_transformed.rb
191
- - test/rbbt/text/segment/test_relationship.rb
192
- - test/rbbt/text/segment/test_named_entity.rb
193
- - test/rbbt/text/segment/test_segmented.rb
194
193
  - test/rbbt/bow/test_bow.rb
195
194
  - test/rbbt/bow/test_misc.rb
196
195
  - test/rbbt/bow/test_dictionary.rb
196
+ - test/rbbt/test_document.rb
197
+ - test/rbbt/document/test_annotation.rb
198
+ - test/rbbt/document/corpus/test_pubmed.rb
199
+ - test/rbbt/document/test_corpus.rb
197
200
  - test/rbbt/entity/test_document.rb
198
201
  - test/rbbt/ner/test_patterns.rb
199
202
  - test/rbbt/ner/test_NER.rb
@@ -209,4 +212,13 @@ test_files:
209
212
  - test/rbbt/ner/test_finder.rb
210
213
  - test/rbbt/ner/test_linnaeus.rb
211
214
  - test/rbbt/ner/test_oscar4.rb
215
+ - test/rbbt/test_segment.rb
216
+ - test/rbbt/segment/test_transformed.rb
217
+ - test/rbbt/segment/test_overlaps.rb
218
+ - test/rbbt/segment/test_annotation.rb
219
+ - test/rbbt/segment/test_named_entity.rb
220
+ - test/rbbt/segment/test_encoding.rb
221
+ - test/rbbt/segment/test_range_index.rb
222
+ - test/rbbt/segment/test_corpus.rb
223
+ - test/test_spaCy.rb
212
224
  - test/test_helper.rb
@@ -1,106 +0,0 @@
1
- require 'rbbt/text/corpus/document'
2
- require 'rbbt/text/corpus/document_repo'
3
-
4
- class Corpus
5
- class << self
6
- attr_accessor :claims
7
- def claim(namespace, &block)
8
- @@claims = {}
9
- @@claims[namespace] = block
10
- end
11
-
12
- end
13
- attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations
14
-
15
- def initialize(corpora_path = nil)
16
- @corpora_path = case
17
- when corpora_path.nil?
18
- Rbbt.corpora
19
- when (not Path === corpora_path)
20
- Path.setup(corpora_path)
21
- else
22
- corpora_path
23
- end
24
-
25
- @corpora_path = @corpora_path.find
26
- @persistence_dir = File.join(@corpora_path, "annotations")
27
-
28
- Misc.lock(@persistence_dir) do
29
- @global_annotations = TSV.setup(Persist.open_tokyocabinet(File.join(@persistence_dir, "global_annotations"), false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"])
30
- @global_annotations.unnamed = true
31
- @global_annotations.close
32
- end
33
-
34
- Misc.lock(@corpora_path.document_repo) do
35
- @document_repo = DocumentRepo.open_tokyocabinet @corpora_path.document_repo, false
36
- @document_repo.close
37
- end
38
-
39
- end
40
-
41
- def persistence_for(docid)
42
- File.join(persistence_dir, docid)
43
- end
44
-
45
-
46
- def docid(docid)
47
- begin
48
- if @document_repo.include?(docid)
49
- Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations, self)
50
- else
51
- namespace, id, type = docid.split(":")
52
- if @@claims.include?(namespace)
53
-
54
- docid = self.instance_exec id, type, &(@@claims[namespace])
55
- docid = docid.first if Array === docid
56
- self.docid(docid)
57
- else
58
- raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
59
- end
60
- end
61
- ensure
62
- @document_repo.close
63
- end
64
- end
65
-
66
- def document(namespace, id, type, hash)
67
- docid = [namespace, id, type, hash] * ":"
68
- self.docid(docid)
69
- end
70
-
71
- def add_document(text, namespace = nil, id = nil, type = nil)
72
- text = Misc.fixutf8(text)
73
- hash = Digest::MD5.hexdigest(text)
74
- @document_repo.add(text, namespace, id, type, hash)
75
- end
76
-
77
- def add_docid(text, docid)
78
- namespace, id, type, hash = docid.split(":")
79
- @document_repo.add(text, namespace, id, type, hash)
80
- end
81
-
82
-
83
- def find(namespace=nil, id = nil, type = nil, hash = nil)
84
- @document_repo.find(namespace, id, type, hash).collect{|docid|
85
- self.docid(docid)
86
- }
87
- end
88
-
89
- def find_docid(docid)
90
- @document_repo.find_docid(docid).collect{|docid|
91
- self.docid(docid)
92
- }
93
- end
94
-
95
- def exists?(namespace=nil, id = nil, type = nil, hash = nil)
96
- find(namespace, id, type, hash).any?
97
- end
98
-
99
- def [](docid)
100
- self.docid(docid)
101
- end
102
-
103
- def include?(id)
104
- @document_repo.include? id
105
- end
106
- end