treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -0,0 +1,115 @@
|
|
1
|
+
# This module provides configuration options for the Treat toolkit
|
2
|
+
# (enable/disable syntactic sugar, enable/disable language detection
|
3
|
+
# and set default language or language detection level).
|
4
|
+
module Treat::Configurable
|
5
|
+
|
6
|
+
# Modify the singleton class of the base module (Treat).
|
7
|
+
def self.extended(base)
|
8
|
+
|
9
|
+
# Configuration options that are available for the Treat module.
|
10
|
+
class << base
|
11
|
+
# Symbol - default language to use when detect_language is false.
|
12
|
+
attr_accessor :default_language
|
13
|
+
# Boolean - detect language or use default?
|
14
|
+
attr_accessor :detect_language
|
15
|
+
# Symbol - the finest entity level at which to detect language.
|
16
|
+
attr_accessor :language_detection_level
|
17
|
+
# Boolean - whether to output debug information or not.
|
18
|
+
attr_accessor :debug
|
19
|
+
# Boolean - whether to silence the output of external programs.
|
20
|
+
attr_accessor :silence
|
21
|
+
end
|
22
|
+
|
23
|
+
# Set the default options.
|
24
|
+
base.module_eval do
|
25
|
+
# Set the default language to english.
|
26
|
+
self.default_language = :eng
|
27
|
+
# Turn language detection off by default.
|
28
|
+
self.detect_language = false
|
29
|
+
# Detect the language once per document by default.
|
30
|
+
self.language_detection_level = :document
|
31
|
+
# Set debug to off by default.
|
32
|
+
self.debug = false
|
33
|
+
# Silence external programs by default.
|
34
|
+
self.silence = true
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
|
39
|
+
# Turn on syntactic sugar for the creation of Entities.
|
40
|
+
#
|
41
|
+
# All entities found under Treat::Entities will be made
|
42
|
+
# available within the global namespace. As an example,
|
43
|
+
# 'Treat::Entities::Word' can then be referred to as 'Word'.
|
44
|
+
#
|
45
|
+
# There is one exception: the Symbol class is not sweetened
|
46
|
+
# to avoid clashing with the Symbol class defined by Ruby.
|
47
|
+
def sweeten!
|
48
|
+
return if @@sweetened
|
49
|
+
@@sweetened = true
|
50
|
+
each_entity_class do |type, klass|
|
51
|
+
Object.class_eval do
|
52
|
+
unless type == :Symbol
|
53
|
+
define_method(type) do |file_or_value, options={}|
|
54
|
+
klass.build(file_or_value, options)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
# Turn off syntactic sugar.
|
62
|
+
def unsweeten!
|
63
|
+
return unless @@sweetened
|
64
|
+
@@sweetened = false
|
65
|
+
each_entity_class do |type, klass|
|
66
|
+
Object.class_eval do
|
67
|
+
remove_method(type)
|
68
|
+
end unless type == :Symbol
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
# Boolean - whether syntactic sugar is
|
73
|
+
# enabled or not.
|
74
|
+
def sweetened?; @@sweetened; end
|
75
|
+
|
76
|
+
# Syntactic sugar is disabled by default.
|
77
|
+
@@sweetened = false
|
78
|
+
|
79
|
+
# Turn on language detection, optionally setting
|
80
|
+
# the language detection level (finest level at
|
81
|
+
# which language detection is performed).
|
82
|
+
def self.detect!(level = nil)
|
83
|
+
self.detect_language = true
|
84
|
+
if level
|
85
|
+
self.language_detection_level = level
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Turn off language detection, optionally setting
|
90
|
+
# a new default language to use.
|
91
|
+
def self.undetect!(default = :english)
|
92
|
+
self.detect_language = false
|
93
|
+
if default
|
94
|
+
self.default_language = default
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
# Use the supplied language by default and
|
99
|
+
# turn off language detection.
|
100
|
+
def self.use(language)
|
101
|
+
self.detect_language = false
|
102
|
+
self.default_language = language
|
103
|
+
end
|
104
|
+
|
105
|
+
private
|
106
|
+
# Helper method, yields each entity type and class.
|
107
|
+
def each_entity_class
|
108
|
+
Treat::Entities.list.each do |entity_type|
|
109
|
+
type = cc(entity_type).intern
|
110
|
+
klass = Treat::Entities.const_get(type, klass)
|
111
|
+
yield type, klass
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
class Treat::DataSet
|
2
|
+
|
3
|
+
require 'psych'
|
4
|
+
require 'treat/classification'
|
5
|
+
|
6
|
+
attr_reader :classification
|
7
|
+
attr_reader :labels
|
8
|
+
attr_reader :items
|
9
|
+
attr_reader :ids
|
10
|
+
|
11
|
+
def self.open(file)
|
12
|
+
unless File.readable?(file)
|
13
|
+
raise Treat::Exception,
|
14
|
+
"Cannot load data set "+
|
15
|
+
"from #{file} because " +
|
16
|
+
"it doesn't exist."
|
17
|
+
end
|
18
|
+
::Psych.load(
|
19
|
+
File.read(file))
|
20
|
+
end
|
21
|
+
|
22
|
+
def initialize(classification)
|
23
|
+
@classification = classification
|
24
|
+
@labels = classification.labels
|
25
|
+
@items = []
|
26
|
+
@ids = []
|
27
|
+
end
|
28
|
+
|
29
|
+
def <<(entity)
|
30
|
+
@items <<
|
31
|
+
@classification.
|
32
|
+
export_item(entity)
|
33
|
+
@ids << entity.id
|
34
|
+
end
|
35
|
+
|
36
|
+
def save(file)
|
37
|
+
File.open(file, 'w') do |f|
|
38
|
+
f.write(::Psych.dump(self))
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
class Treat::Dependencies
|
2
|
+
|
3
|
+
Gem = [
|
4
|
+
['psych', '1.2.2', '(un)serialize annotated entities to YAML format'],
|
5
|
+
['nokogiri', '>= 1.4.0', 'read and parse XML and HTML formats'],
|
6
|
+
['sdsykes-ferret', '>= 0.11.6.19', 'perform full-text search in collections'],
|
7
|
+
['lda-ruby', '>= 0.3.8', 'extract topic words from documents and collections'],
|
8
|
+
['ruby-readability', '>= 0.5.0', 'extract the readable content from HTML pages'],
|
9
|
+
['stanford-core-nlp', '>= 0.1.8', 'tokenize, segment, parse texts and perform named entity recognition'],
|
10
|
+
['whatlanguage', '>= 1.0.0', 'detect the language of text'],
|
11
|
+
['linguistics', '>= 1.0.9', 'retrieve the inflection of nouns, verbs and numbers in English'],
|
12
|
+
['punkt-segmenter', '>= 0.9.1', 'segment texts into sentences'],
|
13
|
+
['chronic', '>= 0.6.7', 'detect date and time in text'],
|
14
|
+
['decisiontree', '>= 0.3.0', 'perform decision tree classification of text entities']
|
15
|
+
]
|
16
|
+
|
17
|
+
Binary = [
|
18
|
+
['ocropus', 'recognize text in image files'],
|
19
|
+
['antiword', 'extract text from DOC files'],
|
20
|
+
['poppler-utils', 'extract text from PDF files'],
|
21
|
+
['graphviz', 'export and visualize directed graphs']
|
22
|
+
]
|
23
|
+
|
24
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# Download a file without storing it entirely in memory.
|
2
|
+
class Treat::Downloader
|
3
|
+
|
4
|
+
require 'net/http'
|
5
|
+
require 'fileutils'
|
6
|
+
|
7
|
+
class << self
|
8
|
+
attr_accessor :show_progress
|
9
|
+
end
|
10
|
+
|
11
|
+
self.show_progress = false
|
12
|
+
|
13
|
+
MaxTries = 3
|
14
|
+
|
15
|
+
# Download a file into destination, and return
|
16
|
+
# the path to the downloaded file. If the filename
|
17
|
+
# is nil, it will set the default filename to 'top'.
|
18
|
+
def self.download(protocol, server, dir, file = nil, target_base = nil, target_dir = nil)
|
19
|
+
|
20
|
+
require 'progressbar' if self.show_progress
|
21
|
+
|
22
|
+
target_base ||= Treat.files
|
23
|
+
target_dir ||= server
|
24
|
+
|
25
|
+
dir += '/' if dir && dir[-1] != '/'
|
26
|
+
resource = "#{dir}#{file}"
|
27
|
+
resource = "/#{resource}" unless resource[0] == '/'
|
28
|
+
url = "#{server}#{resource}"
|
29
|
+
path = File.join(target_base, target_dir)
|
30
|
+
|
31
|
+
unless FileTest.directory?(path)
|
32
|
+
FileUtils.mkdir(path)
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
file = File.open("#{path}/#{file}", 'w')
|
37
|
+
tries = 0
|
38
|
+
begin
|
39
|
+
|
40
|
+
Net::HTTP.start(server) do |http|
|
41
|
+
|
42
|
+
http.use_ssl = true if protocol == 'https'
|
43
|
+
|
44
|
+
http.request_get(resource) do |response|
|
45
|
+
|
46
|
+
if response.content_length
|
47
|
+
length = response.content_length
|
48
|
+
else
|
49
|
+
warn 'Unknown file size; ETR unknown.'
|
50
|
+
length = 10000
|
51
|
+
end
|
52
|
+
|
53
|
+
pbar = self.show_progress ?
|
54
|
+
ProgressBar.new(url, length) : nil
|
55
|
+
|
56
|
+
unless response.code == '200'
|
57
|
+
raise Treat::Exception,
|
58
|
+
"response code was not 200 "+
|
59
|
+
"OK, but was #{response.code}. "
|
60
|
+
end
|
61
|
+
|
62
|
+
response.read_body do |segment|
|
63
|
+
pbar.inc(segment.length) if pbar
|
64
|
+
file.write(segment)
|
65
|
+
end
|
66
|
+
|
67
|
+
pbar.finish if pbar
|
68
|
+
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
72
|
+
|
73
|
+
file.path.to_s
|
74
|
+
|
75
|
+
rescue Exception => error
|
76
|
+
tries += 1
|
77
|
+
retry if tries < MaxTries
|
78
|
+
raise Treat::Exception,
|
79
|
+
"Couldn't download #{url}. (#{error.message})"
|
80
|
+
file.delete
|
81
|
+
ensure
|
82
|
+
file.close
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
86
|
+
|
87
|
+
end
|
data/lib/treat/entities.rb
CHANGED
@@ -1,74 +1,76 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
# Provide a list of
|
16
|
-
# as
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
1
|
+
# Entities are Tree structures that represent textual entities
|
2
|
+
# (from a collection of texts down to an individual word) with
|
3
|
+
# a unique identifier, a value, features, children and dependencies
|
4
|
+
# linking them to other textual entities.
|
5
|
+
#
|
6
|
+
# - A Collection represents a folder containing documents (and folders).
|
7
|
+
# - A Document represents a file with a textual content.
|
8
|
+
# - A Zone represents a logical division of content in a document.
|
9
|
+
# - A Phrase is a group of words; a Sentence is a Phrase with an ender.
|
10
|
+
# - A Token represents a Word, a Number, a Punctuation or a Symbol.
|
11
|
+
module Treat::Entities
|
12
|
+
|
13
|
+
# Variables for the singleton class.
|
14
|
+
class << self
|
15
|
+
# Provide a list of all entity types except Entity,
|
16
|
+
# as non_camel_case identifiers.
|
17
|
+
attr_accessor :list
|
18
|
+
end
|
19
|
+
|
20
|
+
# Require all entities.
|
21
|
+
require 'treat/entities/entities'
|
22
|
+
|
23
|
+
# Add each constant to the list, except Entity.
|
24
|
+
self.list = []
|
25
|
+
constants.each do |constant|
|
26
|
+
unless constant == :Entity ||
|
27
|
+
constant == :Abilities
|
28
|
+
self.list << ucc(constant).intern
|
26
29
|
end
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
# Make the constants buildable.
|
36
|
-
constants.each do |entity|
|
37
|
-
define_singleton_method(entity) do |value='', id=nil|
|
38
|
-
const_get(entity).build(value, id)
|
39
|
-
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# Make each Entity class buildable magically.
|
33
|
+
# This enables to create Entities without calling
|
34
|
+
# #new (e.g. Word 'hello').
|
35
|
+
constants.each do |entity|
|
36
|
+
define_singleton_method(entity) do |value='', id=nil|
|
37
|
+
const_get(entity).build(value, id)
|
40
38
|
end
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
39
|
+
end
|
40
|
+
|
41
|
+
# Create entity lookup table.
|
42
|
+
@@match_types = nil
|
43
|
+
def self.match_types
|
44
|
+
return @@match_types if @@match_types
|
45
|
+
list = (Treat::Entities.list + [:entity])
|
46
|
+
@@match_types = {}
|
47
|
+
list.each do |type1|
|
48
|
+
list.each do |type2|
|
49
|
+
@@match_types[type2] ||= {}
|
50
|
+
if (type1 == type2) ||
|
51
|
+
(Treat::Entities.const_get(cc(type1)) <
|
52
|
+
Treat::Entities.const_get(cc(type2)))
|
53
|
+
@@match_types[type2][type1] = true
|
56
54
|
end
|
57
55
|
end
|
58
|
-
@@match_types
|
59
56
|
end
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
57
|
+
@@match_types
|
58
|
+
end
|
59
|
+
|
60
|
+
# A bottom-up ordering of general types of entities.
|
61
|
+
@@order = [Token, Phrase, Zone, Document, Collection]
|
62
|
+
|
63
|
+
# Return the hierarchy level of the entity
|
64
|
+
# class, the minimum being a Token and the
|
65
|
+
# maximum being a Collection.
|
66
|
+
#
|
67
|
+
# Implement as true comparison functions.
|
68
|
+
def self.rank(type)
|
69
|
+
klass = Treat::Entities.const_get(cc(type))
|
70
|
+
compare = lambda { |a,b| a == b || a < b }
|
71
|
+
1.upto(@@order.size) do |i|
|
72
|
+
return i if compare.call(klass, @@order[i])
|
72
73
|
end
|
73
74
|
end
|
75
|
+
|
74
76
|
end
|
@@ -0,0 +1,327 @@
|
|
1
|
+
# Represents an object that can be built
|
2
|
+
# from a folder of files, a specific file,
|
3
|
+
# a string or a numeric object. This class
|
4
|
+
# is pretty much self-explanatory.
|
5
|
+
module Treat::Entities::Abilities::Buildable
|
6
|
+
|
7
|
+
require 'treat/helpers/decimal_point_escaper'
|
8
|
+
require 'fileutils'
|
9
|
+
|
10
|
+
# Simple regexps to match common entities.
|
11
|
+
WordRegexp = /^[[:alpha:]\-']+$/
|
12
|
+
NumberRegexp = /^#?([0-9]+)(\^\^[0-9]+)?$/
|
13
|
+
PunctRegexp = /^[[:punct:]\$]+$/
|
14
|
+
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
|
15
|
+
EmailRegexp = /.+\@.+\..+/
|
16
|
+
|
17
|
+
# Reserved folder names
|
18
|
+
Reserved = ['.index']
|
19
|
+
|
20
|
+
# Build an entity from anything (can be
|
21
|
+
# a string, numeric, folder, or file name
|
22
|
+
# representing a raw or serialized file).
|
23
|
+
def build(file_or_value, options = {})
|
24
|
+
|
25
|
+
fv = file_or_value.to_s
|
26
|
+
if self == Treat::Entities::Document
|
27
|
+
if fv =~ UriRegexp
|
28
|
+
from_url(fv, options)
|
29
|
+
else
|
30
|
+
from_file(fv, options)
|
31
|
+
end
|
32
|
+
elsif self == Treat::Entities::Collection
|
33
|
+
if FileTest.directory?(fv)
|
34
|
+
from_folder(fv, options)
|
35
|
+
else
|
36
|
+
create_collection(fv)
|
37
|
+
end
|
38
|
+
else
|
39
|
+
if file_or_value.is_a?(String)
|
40
|
+
from_string(file_or_value)
|
41
|
+
elsif file_or_value.is_a?(Numeric)
|
42
|
+
from_numeric(file_or_value)
|
43
|
+
else
|
44
|
+
raise Treat::Exception,
|
45
|
+
"Unrecognizable input '#{fv}'. "+
|
46
|
+
"Please supply a folder, " +
|
47
|
+
"filename, string or number."
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
|
53
|
+
# Build an entity from a string. Type is
|
54
|
+
# enforced only if requested or if the entity
|
55
|
+
# is user-created (i.e. by calling build
|
56
|
+
# instead of from_string directly).
|
57
|
+
def from_string(string, enforce_type = false)
|
58
|
+
|
59
|
+
Treat::Helpers::DecimalPointEscaper.escape!(string)
|
60
|
+
|
61
|
+
enforce_type = true if caller_method == :build
|
62
|
+
|
63
|
+
unless self == Treat::Entities::Entity
|
64
|
+
return self.new(string) if enforce_type
|
65
|
+
end
|
66
|
+
|
67
|
+
e = anything_from_string(string)
|
68
|
+
|
69
|
+
if enforce_type && !e.is_a?(self)
|
70
|
+
raise "Asked to build a #{cl(self).downcase} "+
|
71
|
+
"from \"#{string}\" and to enforce type, "+
|
72
|
+
"but type detected was #{cl(e.class).downcase}."
|
73
|
+
end
|
74
|
+
|
75
|
+
e
|
76
|
+
end
|
77
|
+
|
78
|
+
# Build a document from a URL.
|
79
|
+
def from_url(url, options)
|
80
|
+
unless self ==
|
81
|
+
Treat::Entities::Document
|
82
|
+
raise Treat::Exception,
|
83
|
+
'Cannot create something ' +
|
84
|
+
'else than a document from a url.'
|
85
|
+
end
|
86
|
+
|
87
|
+
uri = ::URI.parse(url)
|
88
|
+
|
89
|
+
sp = uri.path.split('/')
|
90
|
+
sp.shift if sp[0] == ''
|
91
|
+
|
92
|
+
file = sp[-1]
|
93
|
+
path = sp.size == 1 ?
|
94
|
+
'/' : sp[0..-2].join('/')
|
95
|
+
|
96
|
+
f = Treat::Downloader.download(
|
97
|
+
uri.scheme, uri.host, path, file)
|
98
|
+
options[:_default_format] = :html
|
99
|
+
|
100
|
+
e = from_file(f, options)
|
101
|
+
e.set :url, url
|
102
|
+
e
|
103
|
+
|
104
|
+
end
|
105
|
+
|
106
|
+
# Build an entity from a Numeric object.
|
107
|
+
def from_numeric(numeric)
|
108
|
+
unless (self ==
|
109
|
+
Treat::Entities::Number) ||
|
110
|
+
(self == Treat::Entities::Token) ||
|
111
|
+
(self == Treat::Entities::Entity)
|
112
|
+
raise Treat::Exception,
|
113
|
+
"Cannot create something " +
|
114
|
+
"else than a number/token from " +
|
115
|
+
"a numeric object."
|
116
|
+
end
|
117
|
+
n = numeric.to_s
|
118
|
+
Treat::Helpers::DecimalPointEscaper.unescape!(n)
|
119
|
+
Treat::Entities::Number.new(n)
|
120
|
+
end
|
121
|
+
|
122
|
+
# Build an entity from a folder with documents.
|
123
|
+
# Folders will be searched recursively.
|
124
|
+
def from_folder(folder, options)
|
125
|
+
|
126
|
+
return if Reserved.include?(folder)
|
127
|
+
|
128
|
+
unless FileTest.directory?(folder)
|
129
|
+
raise Treat::Exception,
|
130
|
+
"Path '#{folder}' does " +
|
131
|
+
"not point to a folder."
|
132
|
+
end
|
133
|
+
|
134
|
+
unless File.readable?(folder)
|
135
|
+
raise Treat::Exception,
|
136
|
+
"Folder '#{folder}' is not readable."
|
137
|
+
end
|
138
|
+
|
139
|
+
unless self ==
|
140
|
+
Treat::Entities::Collection
|
141
|
+
raise Treat::Exception,
|
142
|
+
"Cannot create something " +
|
143
|
+
"else than a collection " +
|
144
|
+
"from folder '#{folder}'."
|
145
|
+
end
|
146
|
+
|
147
|
+
c = Treat::Entities::Collection.new(folder)
|
148
|
+
folder += '/' unless folder[-1] == '/'
|
149
|
+
|
150
|
+
Dir[folder + '*'].each do |f|
|
151
|
+
if FileTest.directory?(f)
|
152
|
+
c2 = Treat::Entities::Collection.
|
153
|
+
from_folder(f, options)
|
154
|
+
c.<<(c2, false) if c2
|
155
|
+
else
|
156
|
+
c.<<(Treat::Entities::Document.
|
157
|
+
from_file(f, options), false)
|
158
|
+
end
|
159
|
+
end
|
160
|
+
c
|
161
|
+
|
162
|
+
end
|
163
|
+
|
164
|
+
# Build a document from a raw or serialized file.
|
165
|
+
def from_file(file, options)
|
166
|
+
|
167
|
+
unless File.readable?(file)
|
168
|
+
raise Treat::Exception,
|
169
|
+
"Path '#{file}' does not "+
|
170
|
+
"point to a readable file."
|
171
|
+
end
|
172
|
+
|
173
|
+
dflt = options[:_default_format]
|
174
|
+
fmt = Treat::Formatters::Readers::Autoselect.
|
175
|
+
detect_format(file, dflt)
|
176
|
+
options[:_format] = fmt
|
177
|
+
|
178
|
+
if fmt == :yaml || fmt == :yml ||
|
179
|
+
(fmt == :xml && is_treat_xml?(file))
|
180
|
+
f = from_serialized_file(file, options)
|
181
|
+
else
|
182
|
+
f = from_raw_file(file, options)
|
183
|
+
end
|
184
|
+
|
185
|
+
end
|
186
|
+
|
187
|
+
# Build a document from a raw file.
|
188
|
+
def from_raw_file(file, options)
|
189
|
+
|
190
|
+
unless self ==
|
191
|
+
Treat::Entities::Document
|
192
|
+
raise Treat::Exception,
|
193
|
+
"Cannot create something else than a " +
|
194
|
+
"document from raw file '#{file}'."
|
195
|
+
end
|
196
|
+
|
197
|
+
d = Treat::Entities::Document.new(file)
|
198
|
+
|
199
|
+
d.read(:autoselect, options)
|
200
|
+
|
201
|
+
end
|
202
|
+
|
203
|
+
# Build an entity from a serialized file.
|
204
|
+
def from_serialized_file(file, options)
|
205
|
+
|
206
|
+
d = Treat::Entities::Document.new(file)
|
207
|
+
d.unserialize(:autoselect, options)
|
208
|
+
d.children[0].set_as_root!
|
209
|
+
d.children[0]
|
210
|
+
|
211
|
+
end
|
212
|
+
|
213
|
+
# Build any kind of entity from a string.
|
214
|
+
def anything_from_string(string)
|
215
|
+
|
216
|
+
case cl(self).downcase.intern
|
217
|
+
when :document, :collection
|
218
|
+
raise Treat::Exception,
|
219
|
+
"Cannot create a document or " +
|
220
|
+
"collection from a string " +
|
221
|
+
"(need a readable file/folder)."
|
222
|
+
when :phrase
|
223
|
+
phrase_from_string(string)
|
224
|
+
when :token
|
225
|
+
token_from_string(string)
|
226
|
+
when :zone
|
227
|
+
zone_from_string(string)
|
228
|
+
when :entity
|
229
|
+
if string.count(' ') == 0
|
230
|
+
token_from_string(string)
|
231
|
+
else
|
232
|
+
if string.gsub(/[\.\!\?]+/,
|
233
|
+
'.').count('.') <= 1 &&
|
234
|
+
string.count("\n") == 0
|
235
|
+
phrase_from_string(string)
|
236
|
+
else
|
237
|
+
zone_from_string(string)
|
238
|
+
end
|
239
|
+
end
|
240
|
+
else
|
241
|
+
self.new(string)
|
242
|
+
end
|
243
|
+
|
244
|
+
end
|
245
|
+
|
246
|
+
def check_encoding(string)
|
247
|
+
string.encode("UTF-8", undef: :replace) # Fix
|
248
|
+
end
|
249
|
+
|
250
|
+
# Build a phrase from a string.
|
251
|
+
def phrase_from_string(string)
|
252
|
+
|
253
|
+
check_encoding(string)
|
254
|
+
|
255
|
+
if string.count('.!?') >= 1
|
256
|
+
Treat::Entities::Sentence.new(string)
|
257
|
+
else
|
258
|
+
Treat::Entities::Phrase.new(string)
|
259
|
+
end
|
260
|
+
|
261
|
+
end
|
262
|
+
|
263
|
+
# Build the right type of token
|
264
|
+
# corresponding to a string.
|
265
|
+
def token_from_string(string)
|
266
|
+
|
267
|
+
check_encoding(string)
|
268
|
+
if string == "'s" || string == "'S"
|
269
|
+
Treat::Entities::Clitic.new(string)
|
270
|
+
elsif string =~ WordRegexp &&
|
271
|
+
string.count(' ') == 0 &&
|
272
|
+
string != '-'
|
273
|
+
Treat::Entities::Word.new(string)
|
274
|
+
elsif string =~ NumberRegexp
|
275
|
+
from_numeric(string)
|
276
|
+
elsif string =~ PunctRegexp
|
277
|
+
Treat::Entities::Punctuation.new(string)
|
278
|
+
elsif string.count('.') > 0 &&
|
279
|
+
string =~ UriRegexp
|
280
|
+
Treat::Entities::Url.new(string)
|
281
|
+
elsif string.count('@') > 0 &&
|
282
|
+
string =~ EmailRegexp
|
283
|
+
Treat::Entities::Email.new(string)
|
284
|
+
else
|
285
|
+
Treat::Entities::Symbol.new(string)
|
286
|
+
end
|
287
|
+
end
|
288
|
+
|
289
|
+
# Build the right type of zone
|
290
|
+
# corresponding to the string.
|
291
|
+
|
292
|
+
def zone_from_string(string)
|
293
|
+
|
294
|
+
check_encoding(string)
|
295
|
+
dot = string.count('.!?')
|
296
|
+
if dot && dot >= 1 && string.count("\n") > 0
|
297
|
+
Treat::Entities::Section.new(string)
|
298
|
+
elsif string.count('.') == 0 &&
|
299
|
+
string.size < 45
|
300
|
+
Treat::Entities::Title.new(string)
|
301
|
+
else
|
302
|
+
Treat::Entities::Paragraph.new(string)
|
303
|
+
end
|
304
|
+
|
305
|
+
end
|
306
|
+
|
307
|
+
# Eventually find a better way.
|
308
|
+
def is_treat_xml?(file)
|
309
|
+
|
310
|
+
beginning = nil
|
311
|
+
|
312
|
+
File.open(file) do |w|
|
313
|
+
beginning = w.readlines(200)
|
314
|
+
end
|
315
|
+
|
316
|
+
beginning = beginning.join(' ')
|
317
|
+
beginning.count('<treat>') > 0
|
318
|
+
|
319
|
+
end
|
320
|
+
|
321
|
+
def create_collection(fv)
|
322
|
+
debug("Creating new collection in directory #{fv}.")
|
323
|
+
FileUtils.mkdir(fv)
|
324
|
+
Treat::Entities::Collection.new(fv)
|
325
|
+
end
|
326
|
+
|
327
|
+
end
|