treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -1,40 +1,41 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
1
|
+
# A wrapper for the Stanford parser's
# Penn-Treebank style tokenizer.
class Treat::Processors::Tokenizers::Stanford

  require 'treat/loaders/stanford'

  # Tokenizer pipeline, loaded on the first call to
  # tokenize and reused by later calls.
  @@tokenizer = nil

  # Tokenize the entity using a Penn-Treebank
  # style tokenizer.
  #
  # The entity is checked for existing children via
  # check_hasnt_children, its string value is annotated
  # by the tokenizer, and the resulting tokens are
  # appended to the entity through add_tokens.
  #
  # Options: none.
  def self.tokenize(entity, options = {})

    entity.check_hasnt_children

    s = entity.to_s

    @@tokenizer ||=
    ::StanfordCoreNLP.load(:tokenize)
    text =
    ::StanfordCoreNLP::Text.new(s)
    @@tokenizer.annotate(text)

    add_tokens(entity, text.get(:tokens))

  end

  # Add the tokens to the entity.
  #
  # The Penn-Treebank escapes for round parentheses
  # ('-LRB-' and '-RRB-') are turned back into the
  # literal characters before the token entity is
  # built, so the entity holds the text as written.
  def self.add_tokens(entity, tokens)
    tokens.each do |token|
      val = token.value
      # TODO: other special characters are not
      # un-escaped yet; only parentheses are handled.
      val = '(' if val == '-LRB-'
      val = ')' if val == '-RRB-'
      # Build the token from the normalised value,
      # not from the raw tokenizer output.
      t = Treat::Entities::Token.
      from_string(val)
      entity << t
    end
  end

end
|
@@ -1,58 +1,67 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
1
|
+
# A tokenizer class lifted from the 'tactful-tokenizer' gem.
|
2
|
+
#
|
3
|
+
# Copyright © 2010 Matthew Bunday. All rights reserved.
|
4
|
+
# Released under the GNU GPL v3. Modified by Louis Mullie.
|
5
|
+
#
|
6
|
+
# Project website: https://github.com/SlyShy/Tactful_Tokenizer
|
7
|
+
class Treat::Processors::Tokenizers::Tactful

  require 'treat/helpers/decimal_point_escaper'

  # Ordered list of [pattern, replacement] pairs.
  # tokenize applies them one after the other, so
  # the order of the entries matters.
  ReTokenize = [
    # Uniform quotes.
    [/''|``/, '"'],
    # Separate punctuation from words.
    [/(^|\s)(')/, '\1\2'],
    [/(?=[\("`{\[:;&#*@\.])(.)/, '\1 '],
    [/(.)(?=[?!\)";}\]*:@\.'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],
    # Treat double-hyphen as a single token.
    [/([^-])(--+)([^-])/, '\1 \2 \3'],
    [/(\s|^)(,)(?=(\S))/, '\1\2 '],
    # Only separate a comma if a space follows.
    [/(.)(,)(\s|$)/, '\1 \2\3'],
    # Combine dots separated by whitespace to be a single token.
    [/\.\s\.\s\./, '...'],
    # Separate "No.6"
    [/([\W]\.)(\d+)/, '\1 \2'],
    # Separate words from ellipses.
    [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
    [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
    [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
    ##### Some additional fixes.
    # Fix %, $, &
    [/(\d)%/, '\1 %'],
    [/\$(\.?\d)/, '$ \1'],
    [/(\W)& (\W)/, '\1&\2'],
    [/(\W\W+)&(\W\W+)/, '\1 & \2'],
    # Fix (n 't) -> ( n't)
    [/n 't( |$)/, " n't\\1"],
    [/N 'T( |$)/, " N'T\\1"],
    # Treebank tokenizer special words.
    [/([Cc])annot/, '\1an not']
  ]

  # Tokenize the entity with the rule-based algorithm
  # taken from the 'tactful-tokenizer' gem.
  #
  # The entity's string value is passed through
  # DecimalPointEscaper.escape! (presumably to shield
  # decimal points from the punctuation rules - confirm
  # against the helper), rewritten in place by every
  # rule in ReTokenize, then split on whitespace. Each
  # resulting piece is appended to the entity as a token.
  def self.tokenize(entity, options = {})
    entity.check_hasnt_children

    text = entity.to_s
    Treat::Helpers::DecimalPointEscaper.escape!(text)

    ReTokenize.each do |pattern, replacement|
      text.gsub!(pattern, replacement)
    end

    text.split(' ').each do |piece|
      entity << Treat::Entities::Token.from_string(piece)
    end
  end

end
|
data/lib/treat/proxies.rb
CHANGED
@@ -1,40 +1,57 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
Treat::Entities::Unknown(self.to_s)
|
1
|
+
# Proxies install builders on core Ruby objects,
# so that methods called on them may be passed
# to the entity that can be built from the core
# class instance.
module Treat::Proxies

  # Provides a base functionality for proxies.
  module Proxy

    # Build the entity corresponding to the proxied
    # object and send the method call to the entity.
    #
    # Only :do and symbols recognised by
    # Treat::Categories.lookup are delegated; anything
    # else falls through to the regular method_missing.
    # The caller's block, if any, is forwarded along
    # with the arguments in both cases.
    def method_missing(sym, *args, &block)
      if sym == :do || Treat::Categories.lookup(sym)
        to_entity.send(sym, *args, &block)
      else
        super(sym, *args, &block)
      end
    end

    # Create an unknown type of entity by default.
    # The builder argument is accepted but not used here.
    def to_entity(builder = nil)
      Treat::Entities::Unknown(self.to_s)
    end

  end

  # Install Treat functions on String objects.
  module String

    # Include base proxy functionality.
    include Treat::Proxies::Proxy

    # Return the entity corresponding to the string.
    # The optional builder argument is accepted (and
    # ignored) so that the signature matches the other
    # proxies' to_entity.
    def to_entity(builder = nil)
      Treat::Entities::Entity.from_string(self.to_s)
    end

  end

  # Install Treat functions on Numeric objects.
  module Numeric

    # Include base proxy functionality.
    include Treat::Proxies::Proxy

    # Return the entity corresponding to the number.
    # The builder argument is accepted but not used here.
    def to_entity(builder = nil)
      Treat::Entities::Number.from_numeric(self)
    end

  end

  # Include the proxies in the core classes.
  ::String.class_eval { include Treat::Proxies::String }
  ::Numeric.class_eval { include Treat::Proxies::Numeric }

end
|
data/lib/treat/retrievers.rb
CHANGED
@@ -1,17 +1,27 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
self.type = :computer
|
12
|
-
self.targets = [:entity]
|
13
|
-
self.default = :ferret
|
14
|
-
end
|
15
|
-
extend Treat::Category
|
1
|
+
# Retrievers locate documents inside collections.
module Treat::Retrievers

  # Indexers build an index of the words used in
  # the documents of a collection.
  #
  # Declared as a group of type :annotator that
  # targets collections, with :ferret as default.
  module Indexers
    extend Treat::Groupable
    self.type = :annotator
    self.targets = [:collection]
    self.default = :ferret
  end

  # Searchers run full-text queries against an
  # indexed collection and retrieve the documents
  # that match.
  #
  # Declared as a group of type :computer that
  # targets collections, with :ferret as default.
  module Searchers
    extend Treat::Groupable
    self.type = :computer
    self.targets = [:collection]
    self.default = :ferret
  end

  # Make Retrievers categorizable. This comes after
  # the group modules above, as in the original file.
  extend Treat::Categorizable

end
|
@@ -1,28 +1,49 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
1
|
+
# A wrapper for the indexing functions of Ferret,
# a port of the Java Lucene search engine.
#
# Documentation:
# http://rubydoc.info/gems/ferret
class Treat::Retrievers::Indexers::Ferret

  # Require Ferret and file utilities.
  silence_warnings { require 'ferret' }
  require 'find'
  require 'fileutils'

  # Create a Ferret index for the collection and
  # store the index in the collection, under the
  # path collection-folder/.index
  #
  # Returns the path to the index. If that folder
  # already exists it is returned as is, without
  # re-indexing.
  #
  # Raises Treat::Exception when the index folder
  # cannot be created.
  def self.index(collection, options = {})

    path = "#{collection.folder}/.index"
    return path if FileTest.directory?(path)

    begin
      FileUtils.mkdir(path)
    # Only wrap ordinary errors; signals and exit
    # requests must propagate untouched.
    rescue StandardError => e
      raise Treat::Exception,
      "Could not create folder for index " +
      "under the collection's folder. " +
      "(#{e.message})."
    end

    index = ::Ferret::Index::Index.new(
      :default_field => 'content',
      :path => path
    )

    begin
      collection.each_document do |doc|
        index.add_document(
          :file => doc.file,
          :content => doc.to_s
        )
      end
    ensure
      # Always release the index, even if adding a
      # document fails part-way through.
      index.close
    end

    path

  end

end
|
@@ -1,53 +1,72 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
1
|
+
# A simple interface to the Ferret information
# retrieval library, which performs full-text
# search within documents of a collection.
#
# Documentation:
# http://rubydoc.info/gems/ferret
class Treat::Retrievers::Searchers::Ferret

  silence_warnings { require 'ferret' }
  require 'find'

  # Defaults merged under the caller's options.
  DefaultOptions = {
    :q => nil,
    :limit => :all,
    :callback => nil
  }

  # Returns an array of retrieved documents.
  #
  # Options:
  #
  # - (String) :q => a search query (required).
  # - :limit => number of documents to retrieve;
  #   defaults to :all.
  # - :callback => an object responding to call,
  #   invoked with (document, score) for every
  #   retrieved document; optional.
  #
  # Raises Treat::Exception when the collection has
  # no :index, when no query is given, when the index
  # folder is missing, or when an indexed file cannot
  # be matched to a document of the collection.
  def self.search(collection, options = {})

    options = DefaultOptions.merge(options)

    unless collection.has?(:index)
      raise Treat::Exception,
      "This collection must be indexed to be searchable."
    end

    unless options[:q]
      raise Treat::Exception,
      'You must set a query by using the :q option.'
    end

    path = collection.index

    unless FileTest.directory?(path)
      raise Treat::Exception,
      "The index at location #{path} cannot be found."
    end

    index = ::Ferret::Index::Index.new(
      :default_field => 'content',
      :path => path
    )

    # The query is taken out of the options; the rest
    # of the hash is handed to Ferret unchanged.
    query = options.delete(:q)
    files = {}
    begin
      index.search_each(query, options) do |doc, score|
        files[index[doc]['file']] = score
      end
    ensure
      # Release the index once the hits are collected,
      # even if the search itself fails.
      index.close
    end

    docs = []
    files.each do |doc, score|
      doc2 = collection.document_with_file(doc)
      unless doc2
        raise Treat::Exception,
        "Couldn't retrieve indexed " +
        "document with filename #{doc}."
      end
      if options[:callback]
        options[:callback].call(doc2, score)
      end
      docs << doc2
    end

    docs

  end

end
|