treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -0,0 +1,115 @@
# This module provides configuration options for the Treat toolkit:
# enabling/disabling syntactic sugar, enabling/disabling language
# detection, and setting the default language or the language
# detection level.
#
# It is designed to be mixed in with +extend+: the +extended+ hook
# installs the option accessors on the extending module, and the
# instance methods defined below become its singleton methods.
module Treat::Configurable

  # Modify the singleton class of the base module (Treat): define the
  # configuration accessors on it and assign their default values.
  def self.extended(base)

    # Configuration options that are available for the Treat module.
    class << base
      # Symbol - default language to use when detect_language is false.
      attr_accessor :default_language
      # Boolean - detect language or use default?
      attr_accessor :detect_language
      # Symbol - the finest entity level at which to detect language.
      attr_accessor :language_detection_level
      # Boolean - whether to output debug information or not.
      attr_accessor :debug
      # Boolean - whether to silence the output of external programs.
      attr_accessor :silence
    end

    # Set the default options.
    base.module_eval do
      # Set the default language to english.
      self.default_language = :eng
      # Turn language detection off by default.
      self.detect_language = false
      # Detect the language once per document by default.
      self.language_detection_level = :document
      # Set debug to off by default.
      self.debug = false
      # Silence external programs by default.
      self.silence = true
    end

  end

  # Syntactic sugar is disabled by default.
  @@sweetened = false

  # Turn on syntactic sugar for the creation of Entities.
  #
  # For every entity type listed by Treat::Entities, a method with
  # the same name is defined on Object that calls the entity class's
  # +build+ (e.g. Word('hello') calls Treat::Entities::Word.build).
  #
  # There is one exception: the Symbol class is not sweetened
  # to avoid clashing with the Symbol class defined by Ruby.
  #
  # Does nothing if sugar is already enabled.
  def sweeten!
    return if @@sweetened
    @@sweetened = true
    each_entity_class do |type, klass|
      next if type == :Symbol
      Object.class_eval do
        define_method(type) do |file_or_value, options={}|
          klass.build(file_or_value, options)
        end
      end
    end
  end

  # Turn off syntactic sugar by removing the builder methods
  # that #sweeten! defined on Object. Does nothing if sugar
  # is not currently enabled.
  def unsweeten!
    return unless @@sweetened
    @@sweetened = false
    each_entity_class do |type, _klass|
      next if type == :Symbol
      Object.class_eval do
        remove_method(type)
      end
    end
  end

  # Boolean - whether syntactic sugar is
  # enabled or not.
  def sweetened?; @@sweetened; end

  # Turn on language detection, optionally setting
  # the language detection level (finest level at
  # which language detection is performed).
  #
  # Defined as an instance method so that it becomes a singleton
  # method of the extending module, where the detect_language and
  # language_detection_level accessors actually live.
  def detect!(level = nil)
    self.detect_language = true
    if level
      self.language_detection_level = level
    end
  end

  # Turn off language detection, optionally setting
  # a new default language to use.
  #
  # NOTE(review): the default argument is :english, whereas the
  # module-level default language set in +extended+ is :eng;
  # confirm that both identifiers are accepted downstream.
  def undetect!(default = :english)
    self.detect_language = false
    if default
      self.default_language = default
    end
  end

  # Use the supplied language by default and
  # turn off language detection.
  def use(language)
    self.detect_language = false
    self.default_language = language
  end

  private

  # Helper method, yields each entity type (as a camel-cased symbol)
  # and the corresponding class found directly under Treat::Entities.
  def each_entity_class
    Treat::Entities.list.each do |entity_type|
      type = cc(entity_type).intern
      # Look the constant up in Treat::Entities only (inherit = false),
      # so that top-level constants are never picked up by mistake.
      klass = Treat::Entities.const_get(type, false)
      yield type, klass
    end
  end

end
@@ -0,0 +1,42 @@
# A collection of items exported from entities through a
# classification, kept alongside the ids of the source entities.
# A data set can be dumped to, and reloaded from, a YAML file.
class Treat::DataSet

  require 'psych'
  require 'treat/classification'

  # The classification used to export items, its labels, the
  # exported items and the ids of the entities they came from.
  attr_reader :classification, :labels, :items, :ids

  # Load whatever object is serialized in the given YAML file.
  # Raises a Treat::Exception when the file is not readable.
  #
  # NOTE(review): Psych.load deserializes the file content as-is;
  # only open files that come from a trusted source.
  def self.open(file)
    if File.readable?(file)
      return ::Psych.load(File.read(file))
    end
    raise Treat::Exception,
    "Cannot load data set from #{file} because it doesn't exist."
  end

  # Create an empty data set bound to the supplied classification;
  # the labels are taken from the classification.
  def initialize(classification)
    @classification = classification
    @labels = classification.labels
    @items = []
    @ids = []
  end

  # Export the entity through the classification, store the
  # resulting item and remember the entity's id. Returns the
  # array of ids (the value of the last append).
  def <<(entity)
    exported = @classification.export_item(entity)
    @items << exported
    @ids << entity.id
  end

  # Serialize this data set to YAML and write it to the given file.
  def save(file)
    File.open(file, 'w') { |io| io.write(::Psych.dump(self)) }
  end

end
@@ -0,0 +1,24 @@
# Lists the external dependencies of the toolkit as plain data.
class Treat::Dependencies

  # Ruby gems, one entry per gem: [name, version requirement, purpose].
  # This constant is Treat::Dependencies::Gem; it is distinct from
  # the top-level ::Gem module.
  #
  # NOTE(review): 'psych' is the only entry whose version string has
  # no comparison operator; confirm whether an exact pin is intended.
  Gem = [
    [
      'psych', '1.2.2',
      '(un)serialize annotated entities to YAML format'
    ],
    [
      'nokogiri', '>= 1.4.0',
      'read and parse XML and HTML formats'
    ],
    [
      'sdsykes-ferret', '>= 0.11.6.19',
      'perform full-text search in collections'
    ],
    [
      'lda-ruby', '>= 0.3.8',
      'extract topic words from documents and collections'
    ],
    [
      'ruby-readability', '>= 0.5.0',
      'extract the readable content from HTML pages'
    ],
    [
      'stanford-core-nlp', '>= 0.1.8',
      'tokenize, segment, parse texts and perform named entity recognition'
    ],
    [
      'whatlanguage', '>= 1.0.0',
      'detect the language of text'
    ],
    [
      'linguistics', '>= 1.0.9',
      'retrieve the inflection of nouns, verbs and numbers in English'
    ],
    [
      'punkt-segmenter', '>= 0.9.1',
      'segment texts into sentences'
    ],
    [
      'chronic', '>= 0.6.7',
      'detect date and time in text'
    ],
    [
      'decisiontree', '>= 0.3.0',
      'perform decision tree classification of text entities'
    ]
  ]

  # External programs, one entry per program: [name, purpose].
  Binary = [
    ['ocropus', 'recognize text in image files'],
    ['antiword', 'extract text from DOC files'],
    ['poppler-utils', 'extract text from PDF files'],
    ['graphviz', 'export and visualize directed graphs']
  ]

end
@@ -0,0 +1,87 @@
# Download a file without storing it entirely in memory.
class Treat::Downloader

  require 'net/http'
  require 'fileutils'

  class << self
    # Boolean - whether to display a progress bar while downloading.
    attr_accessor :show_progress
  end

  self.show_progress = false

  # Maximum number of download attempts before giving up.
  MaxTries = 3

  # Local file name used when no remote file name is supplied.
  DefaultFileName = 'top'

  # Download a file into destination, and return
  # the path to the downloaded file. If the filename
  # is nil (or empty), the downloaded content is stored
  # under the default filename 'top'.
  #
  # protocol    - 'https' enables SSL; anything else uses plain HTTP.
  # server      - host name to connect to.
  # dir         - remote directory of the resource.
  # file        - remote file name (optional).
  # target_base - local base directory (defaults to Treat.files).
  # target_dir  - local sub-directory (defaults to the server name).
  #
  # Up to MaxTries attempts are made. If all of them fail, any
  # partially written file is removed and a Treat::Exception
  # describing the last error is raised.
  def self.download(protocol, server, dir, file = nil, target_base = nil, target_dir = nil)

    require 'progressbar' if self.show_progress

    target_base ||= Treat.files
    target_dir ||= server

    dir += '/' if dir && !dir.end_with?('/')
    resource = "#{dir}#{file}"
    resource = "/#{resource}" unless resource.start_with?('/')
    url = "#{server}#{resource}"
    path = File.join(target_base, target_dir)

    # Creates missing parent directories too; no-op if path exists.
    FileUtils.mkdir_p(path)

    name = file.to_s.empty? ? DefaultFileName : file
    target = "#{path}/#{name}"

    error = nil
    MaxTries.times do
      begin
        fetch(protocol, server, resource, url, target)
        return target
      # Treat::Exception is listed explicitly so that a bad response
      # code is retried whatever that class inherits from, while
      # interrupts and exit requests are left alone.
      rescue StandardError, Treat::Exception => e
        error = e
      end
    end

    # Do not leave a truncated download behind.
    File.delete(target) if File.exist?(target)
    raise Treat::Exception,
    "Couldn't download #{url}. (#{error.message})"

  end

  # Perform one download attempt, streaming the response body of
  # the resource into the target file segment by segment. The file
  # is reopened (and therefore truncated) on every attempt, in
  # binary mode so that the content is written unmodified.
  # Raises a Treat::Exception if the response code is not 200.
  def self.fetch(protocol, server, resource, url, target)

    File.open(target, 'wb') do |out|

      # SSL must be requested when the session is started; it
      # cannot be switched on once the connection is open.
      Net::HTTP.start(server, use_ssl: protocol == 'https') do |http|

        http.request_get(resource) do |response|

          unless response.code == '200'
            raise Treat::Exception,
            "response code was not 200 "+
            "OK, but was #{response.code}. "
          end

          length = response.content_length
          unless length
            warn 'Unknown file size; ETR unknown.'
            # Arbitrary total for the progress bar when the
            # server does not announce a content length.
            length = 10000
          end

          pbar = show_progress ?
          ProgressBar.new(url, length) : nil

          response.read_body do |segment|
            pbar.inc(segment.length) if pbar
            out.write(segment)
          end

          pbar.finish if pbar

        end

      end

    end

  end

  private_class_method :fetch

end
@@ -1,74 +1,76 @@
# Entities are Tree structures that represent textual entities
# (from a collection of texts down to an individual word) with
# a unique identifier, a value, features, children and dependencies
# linking them to other textual entities.
#
# - A Collection represents a folder containing documents (and folders).
# - A Document represents a file with a textual content.
# - A Zone represents a logical division of content in a document.
# - A Phrase is a group of words; a Sentence is a Phrase with an ender.
# - A Token represents a Word, a Number, a Punctuation or a Symbol.
module Treat::Entities

  # Variables for the singleton class.
  class << self
    # Provide a list of all entity types except Entity,
    # as non_camel_case identifiers.
    attr_accessor :list
  end

  # Require all entities.
  require 'treat/entities/entities'

  # Add each constant to the list, except Entity and Abilities.
  self.list = []
  constants.each do |constant|
    unless constant == :Entity ||
      constant == :Abilities
      self.list << ucc(constant).intern
    end
  end

  # Make each Entity class buildable magically.
  # This enables to create Entities without calling
  # #new (e.g. Word 'hello').
  #
  # The second argument is forwarded as the options of the class's
  # +build+ method, so it defaults to an empty hash: builders that
  # read from files index into it and cannot cope with nil.
  constants.each do |entity|
    define_singleton_method(entity) do |value = '', options = {}|
      const_get(entity).build(value, options)
    end
  end

  # Create entity lookup table.
  @@match_types = nil

  # Return a memoized two-level hash such that match_types[t2][t1]
  # is true when t1 is the same type as t2 or a subclass of it.
  # The table covers every listed entity type plus :entity.
  def self.match_types
    return @@match_types if @@match_types
    list = (Treat::Entities.list + [:entity])
    @@match_types = {}
    list.each do |type1|
      list.each do |type2|
        @@match_types[type2] ||= {}
        if (type1 == type2) ||
          (Treat::Entities.const_get(cc(type1)) <
          Treat::Entities.const_get(cc(type2)))
          @@match_types[type2][type1] = true
        end
      end
    end
    @@match_types
  end

  # A bottom-up ordering of general types of entities.
  @@order = [Token, Phrase, Zone, Document, Collection]

  # Return the hierarchy level of the entity
  # class, the minimum being a Token (0) and the
  # maximum being a Collection (4).
  #
  # The level is the index, in the bottom-up ordering, of the
  # first general type that the class equals or inherits from.
  # Returns nil when the class matches none of them.
  def self.rank(type)
    klass = Treat::Entities.const_get(cc(type))
    @@order.each_with_index do |general, level|
      # Module#<= is true for the same class or a descendant,
      # and nil/false otherwise.
      return level if klass <= general
    end
    nil
  end

end
@@ -0,0 +1,10 @@
# Namespace for the -able mixins of the Entity class; loading
# this file requires every Ruby file in the abilities folder.
module Treat::Entities::Abilities

  pattern = 'treat/entities/abilities/*.rb'

  Dir[Treat.lib + pattern].each { |file| require file }

end
@@ -0,0 +1,327 @@
# Represents an object that can be built
# from a folder of files, a specific file,
# a URL, a string or a numeric object.
module Treat::Entities::Abilities::Buildable

  require 'treat/helpers/decimal_point_escaper'
  require 'fileutils'

  # Simple regexps to match common entities.
  WordRegexp = /^[[:alpha:]\-']+$/
  NumberRegexp = /^#?([0-9]+)(\^\^[0-9]+)?$/
  PunctRegexp = /^[[:punct:]\$]+$/
  UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
  EmailRegexp = /.+\@.+\..+/

  # Reserved folder names
  Reserved = ['.index']

  # Build an entity from anything (can be
  # a string, numeric, folder, or file name
  # representing a raw or serialized file).
  #
  # - On Document: a URL is downloaded, anything else is read as a file.
  # - On Collection: an existing folder is loaded, otherwise a new
  #   folder is created.
  # - On any other class: a String or Numeric value is converted;
  #   other inputs raise a Treat::Exception.
  def build(file_or_value, options = {})

    fv = file_or_value.to_s
    if self == Treat::Entities::Document
      if fv =~ UriRegexp
        from_url(fv, options)
      else
        from_file(fv, options)
      end
    elsif self == Treat::Entities::Collection
      if FileTest.directory?(fv)
        from_folder(fv, options)
      else
        create_collection(fv)
      end
    else
      if file_or_value.is_a?(String)
        from_string(file_or_value)
      elsif file_or_value.is_a?(Numeric)
        from_numeric(file_or_value)
      else
        raise Treat::Exception,
        "Unrecognizable input '#{fv}'. "+
        "Please supply a folder, " +
        "filename, string or number."
      end
    end

  end

  # Build an entity from a string. Type is
  # enforced only if requested or if the entity
  # is user-created (i.e. by calling build
  # instead of from_string directly).
  #
  # The supplied string is modified in place by the
  # decimal point escaper.
  def from_string(string, enforce_type = false)

    Treat::Helpers::DecimalPointEscaper.escape!(string)

    enforce_type = true if caller_method == :build

    unless self == Treat::Entities::Entity
      return self.new(string) if enforce_type
    end

    e = anything_from_string(string)

    if enforce_type && !e.is_a?(self)
      raise "Asked to build a #{cl(self).downcase} "+
      "from \"#{string}\" and to enforce type, "+
      "but type detected was #{cl(e.class).downcase}."
    end

    e
  end

  # Build a document from an URL: the resource is downloaded
  # with Treat::Downloader, read as a file (HTML being the default
  # format) and the URL is stored on the entity as the :url feature.
  # Sets options[:_default_format] on the supplied hash.
  def from_url(url, options)
    unless self ==
      Treat::Entities::Document
      raise Treat::Exception,
      'Cannot create something ' +
      'else than a document from a url.'
    end

    uri = ::URI.parse(url)

    # Split the path into its directory part and its last component.
    sp = uri.path.split('/')
    sp.shift if sp[0] == ''

    file = sp[-1]
    path = sp.size == 1 ?
    '/' : sp[0..-2].join('/')

    f = Treat::Downloader.download(
    uri.scheme, uri.host, path, file)
    options[:_default_format] = :html

    e = from_file(f, options)
    e.set :url, url
    e

  end

  # Build an entity from a Numeric object. Only allowed on
  # Number, Token and Entity; always returns a Number.
  def from_numeric(numeric)
    unless (self ==
      Treat::Entities::Number) ||
      (self == Treat::Entities::Token) ||
      (self == Treat::Entities::Entity)
      raise Treat::Exception,
      "Cannot create something " +
      "else than a number/token from " +
      "a numeric object."
    end
    n = numeric.to_s
    Treat::Helpers::DecimalPointEscaper.unescape!(n)
    Treat::Entities::Number.new(n)
  end

  # Build an entity from a folder with documents.
  # Folders will be searched recursively.
  # Returns nil if the folder name is reserved.
  def from_folder(folder, options)

    return if Reserved.include?(folder)

    unless FileTest.directory?(folder)
      raise Treat::Exception,
      "Path '#{folder}' does " +
      "not point to a folder."
    end

    unless File.readable?(folder)
      raise Treat::Exception,
      "Folder '#{folder}' is not readable."
    end

    unless self ==
      Treat::Entities::Collection
      raise Treat::Exception,
      "Cannot create something " +
      "else than a collection " +
      "from folder '#{folder}'."
    end

    c = Treat::Entities::Collection.new(folder)
    folder += '/' unless folder[-1] == '/'

    Dir[folder + '*'].each do |f|
      if FileTest.directory?(f)
        c2 = Treat::Entities::Collection.
        from_folder(f, options)
        c.<<(c2, false) if c2
      else
        c.<<(Treat::Entities::Document.
        from_file(f, options), false)
      end
    end
    c

  end

  # Build a document from a raw or serialized file. The format is
  # detected by the autoselect reader and stored in options[:_format];
  # YAML files and XML files containing a <treat> element are
  # unserialized, everything else is read as a raw file.
  def from_file(file, options)

    unless File.readable?(file)
      raise Treat::Exception,
      "Path '#{file}' does not "+
      "point to a readable file."
    end

    dflt = options[:_default_format]
    fmt = Treat::Formatters::Readers::Autoselect.
    detect_format(file, dflt)
    options[:_format] = fmt

    if fmt == :yaml || fmt == :yml ||
      (fmt == :xml && is_treat_xml?(file))
      from_serialized_file(file, options)
    else
      from_raw_file(file, options)
    end

  end

  # Build a document from a raw file.
  def from_raw_file(file, options)

    unless self ==
      Treat::Entities::Document
      raise Treat::Exception,
      "Cannot create something else than a " +
      "document from raw file '#{file}'."
    end

    d = Treat::Entities::Document.new(file)

    d.read(:autoselect, options)

  end

  # Build an entity from a serialized file. The first child of
  # the unserialized document is made a root and returned.
  def from_serialized_file(file, options)

    d = Treat::Entities::Document.new(file)
    d.unserialize(:autoselect, options)
    d.children[0].set_as_root!
    d.children[0]

  end

  # Build any kind of entity from a string.
  # On Entity itself the type is guessed: a string without spaces
  # becomes a token; a string with at most one run of sentence
  # enders and no newline becomes a phrase; anything else a zone.
  def anything_from_string(string)

    case cl(self).downcase.intern
    when :document, :collection
      raise Treat::Exception,
      "Cannot create a document or " +
      "collection from a string " +
      "(need a readable file/folder)."
    when :phrase
      phrase_from_string(string)
    when :token
      token_from_string(string)
    when :zone
      zone_from_string(string)
    when :entity
      if string.count(' ') == 0
        token_from_string(string)
      else
        if string.gsub(/[\.\!\?]+/,
          '.').count('.') <= 1 &&
          string.count("\n") == 0
          phrase_from_string(string)
        else
          zone_from_string(string)
        end
      end
    else
      self.new(string)
    end

  end

  # Transcode the string to UTF-8, replacing undefined characters.
  #
  # NOTE(review): String#encode returns a new string and the callers
  # in this module discard it, so the string they go on to use is not
  # transcoded. Left unchanged here; decide whether callers should use
  # the return value.
  def check_encoding(string)
    string.encode("UTF-8", undef: :replace) # Fix
  end

  # Build a phrase from a string: a Sentence if the string
  # contains any of '.', '!' or '?', a Phrase otherwise.
  def phrase_from_string(string)

    check_encoding(string)

    if string.count('.!?') >= 1
      Treat::Entities::Sentence.new(string)
    else
      Treat::Entities::Phrase.new(string)
    end

  end

  # Build the right type of token
  # corresponding to a string. The checks are ordered:
  # clitic, word, number, punctuation, URL, e-mail, and
  # finally symbol as the fallback.
  def token_from_string(string)

    check_encoding(string)
    if string == "'s" || string == "'S"
      Treat::Entities::Clitic.new(string)
    elsif string =~ WordRegexp &&
      string.count(' ') == 0 &&
      string != '-'
      Treat::Entities::Word.new(string)
    elsif string =~ NumberRegexp
      from_numeric(string)
    elsif string =~ PunctRegexp
      Treat::Entities::Punctuation.new(string)
    elsif string.count('.') > 0 &&
      string =~ UriRegexp
      Treat::Entities::Url.new(string)
    elsif string.count('@') > 0 &&
      string =~ EmailRegexp
      Treat::Entities::Email.new(string)
    else
      Treat::Entities::Symbol.new(string)
    end
  end

  # Build the right type of zone
  # corresponding to the string: a Section if it has sentence
  # enders and newlines, a Title if it has no period and is
  # shorter than 45 characters, a Paragraph otherwise.
  def zone_from_string(string)

    check_encoding(string)
    dot = string.count('.!?')
    if dot >= 1 && string.count("\n") > 0
      Treat::Entities::Section.new(string)
    elsif string.count('.') == 0 &&
      string.size < 45
      Treat::Entities::Title.new(string)
    else
      Treat::Entities::Paragraph.new(string)
    end

  end

  # Tell whether the file contains the literal '<treat>' tag.
  # Eventually find a better way.
  #
  # The file is scanned line by line and scanning stops at the first
  # match. A substring test is required here: String#count would
  # count the individual characters '<', 't', 'r', ... instead.
  def is_treat_xml?(file)

    File.open(file) do |io|
      io.each_line.any? { |line| line.include?('<treat>') }
    end

  end

  # Create the directory and return a new Collection for it.
  def create_collection(fv)
    debug("Creating new collection in directory #{fv}.")
    FileUtils.mkdir(fv)
    Treat::Entities::Collection.new(fv)
  end

end