treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -0,0 +1,30 @@
1
+ class Treat::Loaders
2
+
3
+ # A helper class to load a language class
4
+ # registered with the Linguistics gem.
5
+ class Linguistics
6
+
7
+ silence_warnings { require 'linguistics' }
8
+ @@languages = {}
9
+
10
+ def self.load(language)
11
+ if @@languages[language]
12
+ return @@languages[language]
13
+ end
14
+ begin
15
+ l = language.to_s.upcase
16
+ silence_warnings do
17
+ @@languages[language] =
18
+ ::Linguistics.const_get(l)
19
+ end
20
+ rescue RuntimeError
21
+ raise "Ruby Linguistics does " +
22
+ "not have a module installed " +
23
+ "for the #{language} language."
24
+ end
25
+
26
+ end
27
+
28
+ end
29
+
30
+ end
@@ -0,0 +1,27 @@
1
+ class Treat::Loaders
2
+
3
+ # A helper class to configure and
4
+ # initialize the StanfordCoreNLP gem.
5
+ class Stanford
6
+
7
+ require 'stanford-core-nlp'
8
+
9
+ StanfordCoreNLP.jar_path =
10
+ Treat.bin + 'stanford/'
11
+
12
+ StanfordCoreNLP.model_path =
13
+ Treat.models + 'stanford/'
14
+
15
+ StanfordCoreNLP.use(
16
+ Treat::Languages.describe(
17
+ Treat.default_language))
18
+
19
+ StanfordCoreNLP.log_file =
20
+ NULL_DEVICE if Treat.silence
21
+
22
+ StanfordCoreNLP.init
23
+ @@loaded = true
24
+
25
+ end
26
+
27
+ end
data/lib/treat/object.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  # Make undefining constants publicly available on any object.
2
2
  Object.module_eval do
3
+ # Unset a constant without private access.
3
4
  def self.const_unset(const)
4
5
  Object.instance_eval { remove_const(const) }
5
6
  end
@@ -1,45 +1,38 @@
1
- module Treat
2
- # Category for processor groups.
3
- #
4
- # A processor group is a group of algorithms for the building
5
- # of trees representing textual entities.
6
- #
7
- # The processor groups include:
8
- #
9
- # - Chunkers : split a text into zone objects.
10
- # - Segmenters : split a text or zone into sentence objects.
11
- # - Tokenizers : split a sentence into Token objects.
12
- # - Parsers: split a sentence into a tree of phrases
13
- # containing other phrases and Token objects, representing
14
- # the syntactic structure.
15
- module Processors
16
- # Chunkers split a text into zones.
17
- module Chunkers
18
- extend Group
19
- self.type = :transformer
20
- self.targets = [:document, :section]
21
- end
22
- # Segmenters split a text or zone into sentences.
23
- module Segmenters
24
- extend Group
25
- self.type = :transformer
26
- self.targets = [:document, :zone]
27
- end
28
- # Tokenizers splits a sentence into Token objects.
29
- module Tokenizers
30
- extend Group
31
- self.type = :transformer
32
- self.targets = [:document, :zone, :phrase]
33
- end
34
- # Parsers split a sentence into phrase objects
35
- # representing its syntactic structure, with the
36
- # Token objects as children of the phrases.
37
- module Parsers
38
- extend Group
39
- self.type = :transformer
40
- self.targets = [:document, :zone, :phrase]
41
- end
42
- # Makes all the groups autoloadable and creates the workers.
43
- extend Treat::Category
1
+ # Processors build trees representing textual entities.
2
+ module Treat::Processors
3
+
4
+ # Chunkers split a document into sections and zones.
5
+ module Chunkers
6
+ extend Treat::Groupable
7
+ self.type = :transformer
8
+ self.targets = [:document]
9
+ self.default = :autoselect
44
10
  end
45
- end
11
+
12
+ # Segmenters split a document or zone into sentences.
13
+ module Segmenters
14
+ extend Treat::Groupable
15
+ self.type = :transformer
16
+ self.targets = [:zone]
17
+ end
18
+
19
+ # Tokenizers split a sentence into Token objects.
20
+ module Tokenizers
21
+ extend Treat::Groupable
22
+ self.type = :transformer
23
+ self.targets = [:phrase]
24
+ end
25
+
26
+ # Parsers split a sentence into phrase objects
27
+ # representing its syntactic structure, with the
28
+ # Token objects as children of the phrases.
29
+ module Parsers
30
+ extend Treat::Groupable
31
+ self.type = :transformer
32
+ self.targets = [:phrase]
33
+ end
34
+
35
+ # Make Processors categorizable.
36
+ extend Treat::Categorizable
37
+
38
+ end
@@ -0,0 +1,16 @@
1
+ class Treat::Processors::Chunkers::Autoselect
2
+
3
+ def self.chunk(entity, options = {})
4
+ entity.check_has(:format)
5
+ begin
6
+ k = Treat::Processors::
7
+ Chunkers.const_get(cc(entity.format))
8
+ k.chunk(entity, options)
9
+ rescue Treat::Exception
10
+ Treat::Processors::
11
+ Chunkers::TXT.chunk(entity, options)
12
+ end
13
+
14
+ end
15
+
16
+ end
@@ -0,0 +1,71 @@
1
+ class Treat::Processors::Chunkers::HTML
2
+
3
+ require 'nokogiri'
4
+
5
+ def self.chunk(entity, options = {})
6
+
7
+ entity.check_hasnt_children
8
+
9
+ doc = Nokogiri::HTML(entity.value)
10
+ recurse(entity, doc)
11
+
12
+ end
13
+
14
+ def self.recurse(node, html_node, level = 1)
15
+
16
+ html_node.children.each do |child|
17
+
18
+ next if child.name == 'text'
19
+
20
+ txt = child.inner_text
21
+
22
+ if child.name =~ /^h([0-9]{1})$/ ||
23
+ (child.name == 'p' && txt.length < 45 &&
24
+ node.parent && node.parent.type == :section)
25
+
26
+ if $1
27
+ lvl = $1.to_i
28
+ if lvl <= level
29
+ node.ancestors_with_type(:section).
30
+ each do |s|
31
+ l = s.has?(:level) ? s.level : 1
32
+ node = s if l == lvl - 1
33
+ end
34
+ node = node <<
35
+ Treat::Entities::Section.new
36
+ elsif lvl > level
37
+ node = node <<
38
+ Treat::Entities::Section.new
39
+ end
40
+ level = lvl
41
+ node.set :level, level
42
+
43
+ end
44
+
45
+ t = node <<
46
+ Treat::Entities::Title.new(txt)
47
+ t.set :level, level
48
+
49
+ elsif child.name == 'p'
50
+
51
+ node << Treat::Entities::Zone.
52
+ from_string(txt)
53
+
54
+ elsif ['ul', 'ol'].include?(child.name)
55
+ node = node <<
56
+ Treat::Entities::List.new
57
+ elsif ['li'].include?(child.name)
58
+ n = Treat::Entities::Entity.
59
+ zone_from_string(txt)
60
+ node << n
61
+ end
62
+
63
+ if child.children.size > 0
64
+ recurse(node, child, level)
65
+ end
66
+
67
+ end
68
+
69
+ end
70
+
71
+ end
@@ -1,27 +1,21 @@
1
- module Treat
2
- module Processors
3
- module Chunkers
4
- # This class separates a plain text file into
5
- # zones based on an extremely naive analysis of the
6
- # file. Suprisingly, this works pretty well.
7
- class Txt
8
- # Split a document into Zone objects.
9
- def self.chunk(text, options = {})
10
- zones = text.to_s.split("\n")
11
- zones.each do |zone|
12
- zone.strip!
13
- next if zone == ''
14
- if false # fix
15
- text << Treat::Entities::List.new(zone)
16
- end
17
- if zone.length < 60
18
- text << Treat::Entities::Title.new(zone)
19
- else
20
- text << Treat::Entities::Paragraph.new(zone)
21
- end
22
- end
23
- end
24
- end
1
+ class Treat::Processors::Chunkers::TXT
2
+
3
+ # Separates a string into
4
+ # zones on the basis of newlines.
5
+ #
6
+ # Options: none.
7
+ def self.chunk(entity, options = {})
8
+
9
+ entity.check_hasnt_children
10
+ zones = entity.to_s.split("\n")
11
+
12
+ zones.each do |zone|
13
+ zone.strip!
14
+ next if zone == ''
15
+ entity << Treat::Entities::
16
+ Zone.from_string(zone)
25
17
  end
18
+
26
19
  end
20
+
27
21
  end
@@ -1,218 +1,263 @@
1
- module Treat
2
- module Processors
3
- module Parsers
4
- # The Enju class is a wrapper for the Enju syntactic
5
- # parser for English. Given a file or string input,
6
- # the parser formats it runs it through Enju, and
7
- # parses the XML output by Enju using the Nokogiri
8
- # XML reader. It creates wrappers for the sentences,
9
- # syntactical phrases and tokens that Enju identified.
10
- #
11
- # Original paper:
12
- # Takuya Matsuzaki, Yusuke Miyao, and Jun'ichi Tsujii.
13
- # 2007. Efficient HPSG Parsing with Supertagging and
14
- # CFG-filtering. In Proceedings of IJCAI 2007.
15
- class Enju
16
- # Require the 'open13' library for interaction
17
- # with the background Enju process.
18
- require 'open3'
19
- @@parsers = []
20
- @@i = 0
21
- # Require the Nokogiri XML parser.
22
- require 'nokogiri'
23
- # Return the process running Enju.
24
- def self.proc
25
- if @@parsers.size < @@options[:processes]
26
- @@parsers << ::Open3.popen3("enju -xml -i")
27
- end
28
- @@i += 1
29
- @@i = 0 if @@i == @@parsers.size
30
- @@parsers[@@i-1]
31
- end
32
- # Parse the entity into its syntactical phrases using Enju.
33
- # Calls #build to initiate XML parsing.
34
- def self.parse(entity, options = {})
35
- val = entity.to_s
36
- entity.remove_all! if entity.has_children?
37
- options[:processes] ||= 1
38
- @@options = options
39
- @@id_table = {}
40
- @@dependencies_table = {}
41
- stdin, stdout = proc
42
- text, remove_last = valid_text(val)
43
- stdin.puts(text + "\n")
44
- parsed = build(stdout.gets, remove_last)
45
- if not parsed.nil?
46
- entity.remove_all!
47
- parsed.children.each do |child|
48
- entity << child
49
- end
50
- # Remove the period we added at the end.
51
- if remove_last
52
- last = entity.punctuations[-1]
53
- entity.remove!(last)
54
- end
55
- else
56
- warn "Couldn't parse the text '#{entity.to_s}'."
57
- end
58
- link_heads(entity)
59
- add_dependencies(entity)
60
- end
61
- # Parses an Enju XML output file using the Nogoriki
62
- # XML reader and converts that structure into a tree
63
- # of wrappers for textual entities.
64
- def self.build(xml, remove_last = false)
65
- # Read in the XML file.
66
- xml_reader = Nokogiri::XML::Reader.from_memory(xml)
67
- current_element = nil
68
- previous_depth = 0
69
- # Read the XML file entity by entity.
70
- while xml_reader.read
71
- # The depth in the XML tree.
72
- current_depth = xml_reader.depth
73
- # If we are at the end of the children stack, pop up.
74
- if previous_depth > current_depth
75
- current_element = current_element.parent
76
- end
77
- # If an end element has been reached,
78
- # change the depth and pop up on next
79
- # iteration.
80
- if xml_reader.node_type ==
81
- Nokogiri::XML::Reader::TYPE_END_ELEMENT
82
- previous_depth = current_depth
83
- next
84
- end
85
- # Get and format attributes and dependencies.
86
- attributes = xml_reader.attributes
87
- id = attributes.delete('id')
88
- new_attributes = {}; dependencies = {}
89
- unless attributes.size == 0
90
- new_attributes, dependencies =
91
- cleanup_attributes(xml_reader.name, attributes)
92
- end
93
- # Create the appropriate entity for the
94
- # element.
95
- current_value = ''
96
- case xml_reader.name
97
- when 'sentence'
98
- current_element = Treat::Entities::Sentence.new('')
99
- @@id_table[id] = current_element.id
100
- @@dependencies_table[current_element.id] = dependencies
101
- current_element.features = new_attributes
102
- when 'cons'
103
- current_element = current_element <<
104
- Treat::Entities::Phrase.new('')
105
- @@id_table[id] = current_element.id
106
- @@dependencies_table[current_element.id] = dependencies
107
- current_element.features = new_attributes
108
- when 'tok'
109
- tmp_attributes = new_attributes
110
- tmp_dependencies = dependencies
111
- else
112
- current_value = xml_reader.value.gsub(/\s+/, "")
113
- unless current_value.size == 0
114
- current_element = current_element <<
115
- Treat::Entities::Token.from_string(current_value)
116
- if current_element.is_a?(Treat::Entities::Word)
117
- current_element.features = tmp_attributes
118
- @@id_table[id] = current_element.id
119
- @@dependencies_table[current_element.id] = tmp_dependencies
120
- end
121
- end
122
- end
123
- previous_depth = current_depth
124
- end
125
- current_element
126
- end
127
- # Validate a text - Enju wants period to parse a sentence.
128
- def self.valid_text(val)
129
- if val.count('.') == 0
130
- remove_last = true
131
- text = val + '.'
1
+ # This class is a wrapper for the Enju syntactic
2
+ # parser for English. Given an entity's string value,
3
+ # the parser formats it, runs it through Enju, and
4
+ # parses the XML output by Enju using the Nokogiri
5
+ # XML reader. It creates wrappers for the sentences,
6
+ # syntactical phrases and tokens that Enju identified.
7
+ #
8
+ # Original paper:
9
+ #
10
+ # Takuya Matsuzaki, Yusuke Miyao, and Jun'ichi Tsujii.
11
+ # 2007. Efficient HPSG Parsing with Supertagging and
12
+ # CFG-filtering. In Proceedings of IJCAI 2007.
13
+ module Treat::Processors::Parsers::Enju
14
+
15
+ # Require the 'open3' library to connect
16
+ # with the background Enju process.
17
+ require 'open3'
18
+
19
+ # Require the Nokogiri XML parser.
20
+ require 'nokogiri'
21
+
22
+ # Create only one process and hold on to it.
23
+ @@parser = nil
24
+
25
+ # A hash of Enju cat tags mapped to word categories.
26
+ Ectc = Treat::Linguistics::Tags::EnjuCatToCategory
27
+
28
+ # A hash of Enju cat/xcat pairs mapped to PTB tags.
29
+ Ecxtp = Treat::Linguistics::Tags::EnjuCatXcatToPTB
30
+
31
+ # Parse the entity into its syntactical
32
+ # phrases using Enju.
33
+ #
34
+ # Options: none.
35
+ def self.parse(entity, options = {})
36
+
37
+ entity.check_hasnt_children
38
+ val = entity.to_s
39
+
40
+ @@id_table = {}
41
+ @@dependencies_table = {}
42
+
43
+ stdin, stdout = proc
44
+ text, remove_last = valid_text(val)
45
+ stdin.puts(text + "\n")
46
+
47
+ parsed = build(stdout.gets, remove_last)
48
+
49
+ if parsed
50
+ entity.remove_all!
51
+ parsed.children.each do |child|
52
+ entity << child
53
+ end
54
+ # Remove the period we added at the end.
55
+ if remove_last
56
+ last = entity.punctuations[-1]
57
+ entity.remove!(last)
58
+ end
59
+ else
60
+ warn "Warning - Enju couldn't " +
61
+ "parse the text '#{entity.short_value}'."
62
+ return
63
+ end
64
+
65
+ link_heads(entity)
66
+ add_dependencies(entity)
67
+ end
68
+
69
+ # Return the process running Enju.
70
+ def self.proc
71
+ begin
72
+ @@parser = ::Open3.popen3("enju -xml -i")
73
+ rescue Exception => e
74
+ raise Treat::Exception,
75
+ "Couldn't initialize Enju: #{e.message}."
76
+ end
77
+ @@parser
78
+ end
79
+
80
+ # Parses an Enju XML output file using the Nokogiri
81
+ # XML reader and converts that structure into a tree
82
+ # of wrappers for textual entities.
83
+ def self.build(xml, remove_last = false)
84
+ # Read in the XML file.
85
+ reader = Nokogiri::XML::Reader.from_memory(xml)
86
+ entity = nil
87
+ pd = 0
88
+ # Read the XML file entity by entity.
89
+ while reader.read
90
+ # The depth in the XML tree.
91
+ cd = reader.depth
92
+ # If we are at the end of the
93
+ # children stack, pop up.
94
+ if pd > cd
95
+ entity = entity.parent
96
+ end
97
+ # If an end element has been reached,
98
+ # change the depth and pop up on next
99
+ # iteration.
100
+ if reader.node_type ==
101
+ Nokogiri::XML::Reader::TYPE_END_ELEMENT
102
+ pd = cd
103
+ next
104
+ end
105
+ # Get and format attributes and dependencies.
106
+ attributes = reader.attributes
107
+ id = attributes.delete('id')
108
+ new_attr = {}; dependencies = {}
109
+ unless attributes.size == 0
110
+ new_attr, dependencies =
111
+ cleanup_attributes(reader.name, attributes)
112
+ end
113
+ # Create the appropriate entity for the
114
+ # element.
115
+ current_value = ''
116
+ case reader.name
117
+ when 'sentence'
118
+ entity = Treat::Entities::Sentence.new('')
119
+ @@id_table[id] = entity.id
120
+ @@dependencies_table[entity.id] = dependencies
121
+ entity.features = new_attr
122
+ when 'cons'
123
+ entity = entity <<
124
+ Treat::Entities::Phrase.new('')
125
+ @@id_table[id] = entity.id
126
+ @@dependencies_table[entity.id] = dependencies
127
+ entity.features = new_attr
128
+ when 'tok'
129
+ tmp_attributes = new_attr
130
+ tmp_dependencies = dependencies
131
+ else
132
+ current_value = reader.value.gsub(/\s+/, "")
133
+ unless current_value.size == 0
134
+ entity = entity <<
135
+ Treat::Entities::Token.from_string(current_value)
136
+ if entity.is_a?(Treat::Entities::Word)
137
+ entity.features = tmp_attributes
138
+ @@id_table[id] = entity.id
139
+ @@dependencies_table[entity.id] = tmp_dependencies
132
140
  else
133
- remove_last = false
134
- text = val.gsub('.', '')
135
- text += '.' unless ['!', '?'].include?(text[-1])
136
- end
137
- return text, remove_last
138
- end
139
- # Link the head and sem_head to their entities.
140
- def self.link_heads(entity)
141
- entity.each_phrase do |phrase|
142
- if phrase.has?(:head)
143
- phrase.link(@@id_table[phrase.head], 'head', true, -1)
144
- phrase.unset(:head)
145
- end
146
- if phrase.has?(:sem_head)
147
- phrase.link(@@id_table[phrase.sem_head], 'sem_head', true, -1)
148
- phrase.unset(:sem_head)
149
- end
141
+ # Do something useful here
142
+ entity.set :tag, 'SYM'
150
143
  end
151
144
  end
152
- # Add dependencies a posterior to a parsed entity.
153
- def self.add_dependencies(entity2)
154
- entity2.each_entity(:word, :phrase) do |entity|
155
- @@dependencies_table.each_pair do |id2, dependencies2|
156
- # Next if there are no dependencies.
157
- next if dependencies2.nil?
158
- entity = entity2.root.find(id2)
159
- next if entity.nil?
160
- dependencies2.each_pair do |argument, type|
161
- # Skip this argument if we don't know the target node.
162
- next if argument == 'unk'
163
- entity.link(@@id_table[argument], type.intern)
164
- end
165
- end
166
- end
145
+ end
146
+ pd = cd
147
+ end
148
+ entity
149
+ end
150
+
151
+ # Validate a text - Enju wants a period to parse a sentence.
152
+ def self.valid_text(val)
153
+ if val.count('.') == 0
154
+ remove_last = true
155
+ text = val + '.'
156
+ else
157
+ remove_last = false
158
+ text = val.gsub('.', '')
159
+ text += '.' unless ['!', '?'].include?(text[-1])
160
+ end
161
+ return text, remove_last
162
+ end
163
+
164
+ # Link the head and sem_head to their entities.
165
+ def self.link_heads(entity)
166
+ entity.each_phrase do |phrase|
167
+ if phrase.has?(:head)
168
+ phrase.link(
169
+ @@id_table[phrase.head],
170
+ 'head', true, -1)
171
+ phrase.unset(:head)
172
+ end
173
+ if phrase.has?(:sem_head)
174
+ phrase.link(
175
+ @@id_table[phrase.sem_head],
176
+ 'sem_head', true, -1)
177
+ phrase.unset(:sem_head)
178
+ end
179
+ end
180
+ end
181
+
182
+ # Add dependencies a posteriori to a parsed entity.
183
+ def self.add_dependencies(entity2)
184
+
185
+ entity2.each_entity(:word, :phrase) do |entity|
186
+ @@dependencies_table.each_pair do |id, dependencies|
187
+ next if dependencies.nil?
188
+ entity = entity2.root.find(id)
189
+ next if entity.nil?
190
+ dependencies.each_pair do |argument, type|
191
+ # Skip this argument if we
192
+ # don't know the target node.
193
+ next if argument == 'unk'
194
+ entity.link(
195
+ @@id_table[argument],
196
+ type.intern
197
+ )
167
198
  end
168
- # Helper function to convert Enju attributes to Treat attributes.
169
- def self.cleanup_attributes(name, attributes)
170
- new_attributes = {}
171
- dependencies = {}
172
- pred = attributes.delete('pred')
173
- attributes.each_pair do |attribute2, value|
174
- attribute = attribute2.strip
175
- if attribute == 'arg1' || attribute == 'arg2'
176
- dependencies[value] = pred
177
- next
178
- end
179
- if attribute == 'cat'
180
- new_attributes[:cat] = value
181
- if name == 'tok'
182
- if value.length > 1 && ['P', 'X'].include?(value[-1]) &&
183
- value != 'PN'
184
- new_attributes[:saturated] = (value[-1] == 'P')
185
- value = value[0..-2]
186
- end
187
- new_attributes[:category] =
188
- Treat::Languages::Tags::EnjuCatToCategory[value]
189
- else
190
- tags = Treat::Languages::Tags::EnjuCatXcatToPTB.select do |m|
191
- m[0] == value && m[1] == attributes['xcat']
192
- end
193
- tag = (tags.size == 0) ? 'FW' : tags[0][2]
194
- new_attributes[:tag] = tag
195
- end
196
- else
197
- new_attributes[:"#{attribute}"] = value
198
- end
199
- end
200
- # Delete after iteration.
201
- attributes.delete('arg1')
202
- attributes.delete('arg2')
203
- # Handle naming conventions.
204
- if attributes.has_key?('pos')
205
- new_attributes[:tag] = new_attributes[:pos]
206
- new_attributes[:tag_set] = :penn
207
- new_attributes.delete :pos
199
+ end
200
+ end
201
+
202
+ end
203
+
204
+ # Helper function to convert Enju attributes to Treat attributes.
205
+ def self.cleanup_attributes(name, attributes)
206
+
207
+ new_attr = {}
208
+ dependencies = {}
209
+ pred = attributes.delete('pred')
210
+
211
+ attributes.each_pair do |attribute2, value|
212
+
213
+ attribute = attribute2.strip
214
+
215
+ if attribute == 'arg1' ||
216
+ attribute == 'arg2'
217
+ dependencies[value] = pred
218
+ next
219
+ end
220
+
221
+ if attribute == 'cat'
222
+ new_attr[:cat] = value
223
+ if name == 'tok'
224
+ if value.length > 1 &&
225
+ ['P', 'X'].include?(value[-1]) &&
226
+ value != 'PN'
227
+ new_attr[:saturated] =
228
+ (value[-1] == 'P')
229
+ value = value[0..-2]
208
230
  end
209
- if attributes.has_key?('base')
210
- new_attributes[:lemma] = new_attributes[:base]
211
- new_attributes.delete :base
231
+ new_attr[:category] = Ectc[value]
232
+ else
233
+ tags = Ecxtp.select do |m|
234
+ m[0] == value && m[1] ==
235
+ attributes['xcat']
212
236
  end
213
- return new_attributes, dependencies
237
+ tag = (tags.size == 0) ?
238
+ 'FW' : tags[0][2]
239
+ new_attr[:tag] = tag
214
240
  end
241
+ else
242
+ new_attr[:"#{attribute}"] = value
215
243
  end
244
+
245
+ end
246
+
247
+ # Handle naming conventions.
248
+ if attributes.has_key?('pos')
249
+ new_attr[:tag] = new_attr[:pos]
250
+ new_attr[:tag_set] = :penn
251
+ new_attr.delete :pos
252
+ end
253
+
254
+ if attributes.has_key?('base')
255
+ new_attr[:lemma] = new_attr[:base]
256
+ new_attr.delete :base
216
257
  end
258
+
259
+ return new_attr, dependencies
260
+
217
261
  end
218
- end
262
+
263
+ end