treat 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. data/LICENSE +0 -1
  2. data/files/INFO +1 -1
  3. data/lib/treat/entities/abilities/buildable.rb +2 -6
  4. data/lib/treat/entities/abilities/checkable.rb +2 -2
  5. data/lib/treat/entities/abilities/delegatable.rb +2 -2
  6. data/lib/treat/entities/abilities/doable.rb +6 -1
  7. data/lib/treat/entities/abilities/iterable.rb +8 -0
  8. data/lib/treat/entities/abilities/magical.rb +1 -1
  9. data/lib/treat/extractors.rb +1 -1
  10. data/lib/treat/formatters/visualizers/standoff.rb +1 -1
  11. data/lib/treat/groupable.rb +4 -0
  12. data/lib/treat/installer.rb +33 -13
  13. data/lib/treat/kernel.rb +0 -4
  14. data/lib/treat/languages/arabic.rb +1 -1
  15. data/lib/treat/languages/chinese.rb +1 -1
  16. data/lib/treat/languages/dutch.rb +1 -1
  17. data/lib/treat/languages/english.rb +1 -1
  18. data/lib/treat/languages/french.rb +4 -4
  19. data/lib/treat/languages/german.rb +3 -3
  20. data/lib/treat/languages/italian.rb +1 -1
  21. data/lib/treat/{linguistics/categories.rb → languages/language.rb} +3 -4
  22. data/lib/treat/languages/polish.rb +1 -1
  23. data/lib/treat/languages/portuguese.rb +1 -1
  24. data/lib/treat/languages/russian.rb +1 -1
  25. data/lib/treat/languages/spanish.rb +1 -1
  26. data/lib/treat/languages/swedish.rb +1 -1
  27. data/lib/treat/lexicalizers/categorizers/from_tag.rb +14 -10
  28. data/lib/treat/lexicalizers/taggers/brill.rb +1 -1
  29. data/lib/treat/lexicalizers/taggers/stanford.rb +5 -2
  30. data/lib/treat/lexicalizers.rb +2 -1
  31. data/lib/treat/processors/parsers/enju.rb +2 -2
  32. data/lib/treat/processors/parsers/stanford.rb +17 -11
  33. data/lib/treat/processors/segmenters/punkt.rb +5 -2
  34. data/lib/treat/processors/segmenters/tactful.rb +5 -1
  35. data/lib/treat/processors/tokenizers/ptb.rb +11 -3
  36. data/lib/treat/processors/tokenizers/punkt.rb +0 -3
  37. data/lib/treat/processors/tokenizers/tactful.rb +3 -0
  38. data/lib/treat/universalisation/encodings.rb +12 -0
  39. data/lib/treat/{linguistics → universalisation}/tags.rb +77 -46
  40. data/lib/treat/universalisation.rb +9 -0
  41. data/lib/treat.rb +2 -2
  42. metadata +6 -6
  43. data/lib/treat/linguistics.rb +0 -9
  44. data/lib/treat/processors/tokenizers/perl.rb +0 -132
data/LICENSE CHANGED
@@ -20,7 +20,6 @@ Non-trivial amount of code has been incorporated and modified from other librari
20
20
  - formatters/readers/odt.rb - Mark Watson (GPL license)
21
21
  - processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
22
22
  - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
23
- - processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
24
23
  - processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license)
25
24
  - extractors/topics/reuters.rb - Mark Watson (GPL license)
26
25
  - inflectors/declensions/english.rb - Thomas Sawyer (MIT license)
data/files/INFO CHANGED
@@ -1 +1 @@
1
- This is a folder containing the files downloaded by Treat.
1
+ This is a folder containing the files downloaded by Treat from the internet.
@@ -4,12 +4,11 @@
4
4
  # is pretty much self-explanatory.
5
5
  module Treat::Entities::Abilities::Buildable
6
6
 
7
- require 'treat/helpers/decimal_point_escaper'
8
7
  require 'fileutils'
9
8
 
10
9
  # Simple regexps to match common entities.
11
10
  WordRegexp = /^[[:alpha:]\-']+$/
12
- NumberRegexp = /^#?([0-9]+)(\^\^[0-9]+)?$/
11
+ NumberRegexp = /^#?([0-9]+)(\.[0-9]+)?$/
13
12
  PunctRegexp = /^[[:punct:]\$]+$/
14
13
  UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
15
14
  EmailRegexp = /.+\@.+\..+/
@@ -57,8 +56,6 @@ module Treat::Entities::Abilities::Buildable
57
56
  # instead of from_string directly).
58
57
  def from_string(string, enforce_type = false)
59
58
 
60
- Treat::Helpers::DecimalPointEscaper.escape!(string)
61
-
62
59
  enforce_type = true if caller_method == :build
63
60
 
64
61
  unless self == Treat::Entities::Entity
@@ -74,6 +71,7 @@ module Treat::Entities::Abilities::Buildable
74
71
  end
75
72
 
76
73
  e
74
+
77
75
  end
78
76
 
79
77
  # Build a document from an URL.
@@ -116,7 +114,6 @@ module Treat::Entities::Abilities::Buildable
116
114
  "a numeric object."
117
115
  end
118
116
  n = numeric.to_s
119
- Treat::Helpers::DecimalPointEscaper.unescape!(n)
120
117
  Treat::Entities::Number.new(n)
121
118
  end
122
119
 
@@ -319,7 +316,6 @@ module Treat::Entities::Abilities::Buildable
319
316
  end
320
317
 
321
318
  def create_collection(fv)
322
- debug("Creating new collection in directory #{fv}.")
323
319
  FileUtils.mkdir(fv)
324
320
  Treat::Entities::Collection.new(fv)
325
321
  end
@@ -24,8 +24,8 @@ module Treat::Entities::Abilities::Checkable
24
24
  return unless has_children?
25
25
  raise Treat::Exception,
26
26
  "Warning: can't #{caller_method(2)} "+
27
- "an entity that has children. Removing " +
28
- " all children of text \"[#{short_value}].\""
27
+ "the text \"#{short_value}\", because it " +
28
+ "already has children."
29
29
  end
30
30
 
31
31
  end
@@ -104,9 +104,9 @@ module Treat::Entities::Abilities::Delegatable
104
104
  if !klass[g] || !klass[g][0]
105
105
  d = ucc(cl(group))
106
106
  d.gsub!('_', ' ')
107
- d = 'worker to find "' + d
107
+ d = d[0..-2]
108
108
  raise Treat::Exception, "No #{d}" +
109
- "\" is available for the " +
109
+ " is available for the " +
110
110
  "#{lang.to_s.capitalize} language."
111
111
  end
112
112
  return klass[g][0]
@@ -37,8 +37,13 @@ module Treat::Entities::Abilities::Doable
37
37
  end
38
38
  if f || entity_types.include?(:entity)
39
39
  send(task, worker, options)
40
+ if group.recursive
41
+ each do |entity|
42
+ entity.do_task(task, worker, options, group)
43
+ end
44
+ end
40
45
  else
41
- each_entity(*entity_types) do |entity|
46
+ each do |entity|
42
47
  entity.do_task(task, worker, options, group)
43
48
  end
44
49
  unless entity_types.include?(type)
@@ -95,6 +95,14 @@ module Treat::Entities::Abilities::Iterable
95
95
  as
96
96
  end
97
97
 
98
+ # Returns the first ancestor that has a feature
99
+ # with the given name, otherwise nil.
100
+ def ancestor_with_feature(type, feature)
101
+ each_ancestor(type) do |ancestor|
102
+ return ancestor if ancestor.has?(feature)
103
+ end
104
+ end
105
+
98
106
  alias :ancestors_with_type :ancestors_with_types
99
107
 
100
108
  # Number of children that have a given feature.
@@ -25,7 +25,7 @@ module Treat::Entities::Abilities::Magical
25
25
  def magic(sym, *args)
26
26
 
27
27
  @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
28
- @@cats_regexp ||= "(#{Treat::Linguistics::WordCategories.join('|')})"
28
+ @@cats_regexp ||= "(#{Treat::Languages::Language::WordCategories.join('|')})"
29
29
 
30
30
  method = sym.to_s =~ /entities/ ?
31
31
  sym.to_s.gsub('entities', 'entitys') :
@@ -27,7 +27,7 @@ module Treat::Extractors
27
27
  module Keywords
28
28
  extend Treat::Groupable
29
29
  self.type = :annotator
30
- self.targets = [:document]
30
+ self.targets = [:document, :section, :zone]
31
31
  end
32
32
 
33
33
  # Extract clusters of topic words from a collection.
@@ -44,7 +44,7 @@ class Treat::Formatters::Visualizers::Standoff
44
44
  end
45
45
 
46
46
  def self.ptb_escape(val)
47
- Treat::Linguistics::Tags::
47
+ Treat::Universalisation::Tags::
48
48
  PTBEscapeCharacters.each do |char, esc|
49
49
  val.gsub!(char, val)
50
50
  end
@@ -95,8 +95,12 @@ module Treat::Groupable
95
95
  attr_accessor :presets
96
96
  # The preset option to use with preset functions.
97
97
  attr_accessor :preset_option
98
+ # Whether to recurse within multiple targets or not.
99
+ attr_accessor :recursive
98
100
  end
99
101
 
102
+ self.recursive = false
103
+
100
104
  # Return the method corresponding to the group.
101
105
  # This method resolves the name of the method
102
106
  # that a group should provide based on the name
@@ -82,9 +82,13 @@ module Treat::Installer
82
82
  begin
83
83
  Gem::Specification.find_by_name('punkt-segmenter')
84
84
  title "Downloading model for the Punkt segmenter for the #{l}."
85
- download_punkt_models(language)
85
+ # Need fix
86
+ download_punkt_models([language.to_s])
86
87
  rescue Gem::LoadError; end
87
-
88
+
89
+ # Download reuters models always
90
+ download_reuters_models
91
+
88
92
  # If stanford is installed, download models.
89
93
  begin
90
94
  Gem::Specification.find_by_name('stanford-core-nlp')
@@ -92,7 +96,10 @@ module Treat::Installer
92
96
  "model files for the the #{l}.\n\n"
93
97
  package = (language == :english) ? :english : :all
94
98
  download_stanford(package)
95
- rescue Gem::LoadError; end
99
+ rescue Gem::LoadError
100
+ puts 'Stanford-core-nlp gem not installed.'
101
+ puts 'Skipping download of Stanford models.'
102
+ end
96
103
 
97
104
  title "Install external binary libraries " +
98
105
  "(requires port, apt-get or win-get).\n"
@@ -124,7 +131,7 @@ module Treat::Installer
124
131
  install_dependencies(false)
125
132
  install_language_dependencies(dep, false)
126
133
  download_stanford(:minimal)
127
- download_punkt_models(:english)
134
+ download_punkt_models([:english])
128
135
  end
129
136
 
130
137
  def self.install_dependencies(optionally)
@@ -166,7 +173,7 @@ module Treat::Installer
166
173
  unless man
167
174
  puts "Skipping installation of the "+
168
175
  "following binaries:\n\n"
169
- Binaries.each do |binary, purpose|
176
+ Binary.each do |binary, purpose|
170
177
  puts "- #{binary} to #{purpose}"
171
178
  end
172
179
  return
@@ -227,22 +234,35 @@ module Treat::Installer
227
234
 
228
235
  end
229
236
 
230
- def self.download_punkt_models(language)
237
+ def self.download_punkt_models(languages)
238
+ languages.map! { |l| "#{l}.yaml" }
239
+ download_models 'punkt', languages
240
+ end
241
+
242
+ def self.download_reuters_models
243
+ files = ["industry.xml", "region.xml", "topics.xml"]
244
+ download_models 'reuters', files
245
+ end
231
246
 
232
- f = "#{language}.yaml"
233
- dest = "#{Treat.models}punkt/"
247
+ def self.download_models(directory, files)
234
248
 
235
- loc = Treat::Downloader.download(
236
- 'http', Server, 'treat/punkt', f, Treat.tmp)
249
+ dest = "#{Treat.models}#{directory}/"
237
250
 
238
251
  unless File.readable?(dest)
239
- puts "- Creating directory models/punkt ..."
252
+ puts "- Creating directory models/#{directory} ..."
240
253
  FileUtils.mkdir_p(File.absolute_path(dest))
241
254
  end
242
255
 
243
- puts "- Copying model file to models/punkt ..."
244
- FileUtils.cp(loc, File.join(Paths[:models], 'punkt', f))
245
256
 
257
+ files.each do |file|
258
+ puts "- Downloading #{file} ..."
259
+ loc = Treat::Downloader.download(
260
+ 'http', Server, "treat/#{directory}", file, Treat.tmp)
261
+ puts "- Copying file to models/#{directory} ..."
262
+ FileUtils.cp(loc, File.join(Paths[:models], directory, file))
263
+ end
264
+
265
+
246
266
  puts "- Cleaning up..."
247
267
  FileUtils.rm_rf(Paths[:tmp] + Server)
248
268
 
data/lib/treat/kernel.rb CHANGED
@@ -181,10 +181,6 @@ module Kernel
181
181
  NULL_DEVICE = '/dev/null'
182
182
  end
183
183
 
184
- def debug(msg)
185
- puts msg if Treat.debug
186
- end
187
-
188
184
  def prompt(msg, valid_answers)
189
185
 
190
186
  msg = msg
@@ -6,7 +6,7 @@ class Treat::Languages::Arabic
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford]
9
+ :taggers => [:stanford]
10
10
  }
11
11
  Processors = {
12
12
  :parsers => [:stanford]
@@ -6,7 +6,7 @@ class Treat::Languages::Chinese
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford]
9
+ :taggers => [:stanford]
10
10
  }
11
11
  Processors = {
12
12
  :parsers => [:stanford]
@@ -9,7 +9,7 @@ class Treat::Languages::Dutch
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -31,7 +31,7 @@ class Treat::Languages::English
31
31
  :chunkers => [:txt],
32
32
  :parsers => [:stanford, :enju],
33
33
  :segmenters => [:tactful, :punkt, :stanford],
34
- :tokenizers => [:perl, :ptb, :stanford, :tactful, :punkt]
34
+ :tokenizers => [:ptb, :stanford, :tactful, :punkt]
35
35
  }
36
36
 
37
37
  Retrievers = {
@@ -6,14 +6,14 @@ class Treat::Languages::French
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford],
10
- :category => [:from_tag]
9
+ :taggers => [:stanford],
10
+ :categorizers => [:from_tag]
11
11
  }
12
12
  Processors = {
13
13
  :chunkers => [:txt],
14
14
  :parsers => [:stanford],
15
- :segmenters => [:punkt],
16
- :tokenizers => [:perl, :tactful]
15
+ :segmenters => [:tactful],
16
+ :tokenizers => [:tactful]
17
17
  }
18
18
  Retrievers = {}
19
19
 
@@ -6,14 +6,14 @@ class Treat::Languages::German
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford],
10
- :category => [:from_tag]
9
+ :taggers => [:stanford],
10
+ :categorizers => [:from_tag]
11
11
  }
12
12
  Processors = {
13
13
  :chunkers => [:txt],
14
14
  :parsers => [:stanford],
15
15
  :segmenters => [:punkt],
16
- :tokenizers => [:perl, :tactful]
16
+ :tokenizers => [:tactful]
17
17
  }
18
18
  Retrievers = {}
19
19
 
@@ -10,7 +10,7 @@ class Treat::Languages::Italian
10
10
  :chunkers => [:txt],
11
11
  :parsers => [:stanford],
12
12
  :segmenters => [:punkt],
13
- :tokenizers => [:perl, :tactful]
13
+ :tokenizers => [:tactful]
14
14
  }
15
15
  Retrievers = {}
16
16
 
@@ -1,11 +1,10 @@
1
- module Treat::Linguistics
2
-
3
- # A list of all possible word categories.
1
+ class Treat::Languages::Language
2
+
4
3
  WordCategories = [
5
4
  :adjective, :adverb, :noun, :verb, :interjection,
6
5
  :clitic, :coverb, :conjunction, :determiner, :particle,
7
6
  :preposition, :pronoun, :number, :symbol, :punctuation,
8
7
  :complementizer
9
8
  ]
10
-
9
+
11
10
  end
@@ -9,7 +9,7 @@ class Treat::Languages::Polish
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Portuguese
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Russian
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Spanish
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Swedish
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -3,17 +3,19 @@
3
3
  # from its tag (e.g. 'S', 'NP', 'VBZ', 'ADV', etc.).
4
4
  class Treat::Lexicalizers::Categorizers::FromTag
5
5
 
6
- Pttc = Treat::Linguistics::Tags::PhraseTagToCategory
7
- Wttc = Treat::Linguistics::Tags::WordTagToCategory
8
- Ptc = Treat::Linguistics::Tags::PunctuationToCategory
6
+ Pttc = Treat::Universalisation::Tags::PhraseTagToCategory
7
+ Wttc = Treat::Universalisation::Tags::WordTagToCategory
8
+ Ptc = Treat::Universalisation::Tags::PunctuationToCategory
9
9
 
10
10
  # Find the category of the entity from its tag.
11
11
  def self.category(entity, options = {})
12
12
 
13
13
  tag = entity.check_has(:tag)
14
+
14
15
  return :unknown if tag.nil? || tag == '' || entity.type == :symbol
15
16
  return :sentence if tag == 'S' || entity.type == :sentence
16
17
  return :number if entity.type == :number
18
+
17
19
  return Ptc[entity.to_s] if entity.type == :punctuation
18
20
 
19
21
  if entity.is_a?(Treat::Entities::Phrase)
@@ -29,15 +31,17 @@ class Treat::Lexicalizers::Categorizers::FromTag
29
31
 
30
32
  if entity.has?(:tag_set)
31
33
  ts = entity.get(:tag_set)
32
- elsif entity.parent_phrase &&
33
- entity.parent_phrase.has?(:tag_set)
34
- ts = entity.parent_phrase.get(:tag_set)
35
34
  else
36
- raise Treat::Exception,
37
- "No information can be found regarding "+
38
- "which tag set to use."
35
+ a = entity.ancestor_with_feature(:phrase, :tag_set)
36
+ if a
37
+ ts = a.get(:tag_set)
38
+ else
39
+ raise Treat::Exception,
40
+ "No information can be found regarding "+
41
+ "which tag set to use."
42
+ end
39
43
  end
40
-
44
+
41
45
  if cat[ts]
42
46
  return cat[ts]
43
47
  else
@@ -35,7 +35,7 @@ module Treat::Lexicalizers::Taggers::Brill
35
35
  # Tokenize the sentence/phrase.
36
36
  if !entity.has_children? &&
37
37
  !entity.is_a?(Treat::Entities::Token)
38
- entity.tokenize(:perl, options)
38
+ entity.tokenize(options)
39
39
  end
40
40
 
41
41
  # Create the tagger if necessary
@@ -38,11 +38,14 @@ class Treat::Lexicalizers::Taggers::Stanford
38
38
  end
39
39
 
40
40
  # Handle tags for sentences and phrases.
41
-
42
41
  if entity.is_a?(Treat::Entities::Sentence) ||
43
42
  (entity.is_a?(Treat::Entities::Phrase) &&
44
43
  !entity.parent_sentence)
45
- entity.set :tag_set, :penn
44
+
45
+ tag_set = Treat::Universalisation::Tags::
46
+ StanfordTagSetForLanguage[
47
+ Treat::Languages.describe(lang)]
48
+ entity.set :tag_set, tag_set
46
49
  end
47
50
 
48
51
  if entity.is_a?(Treat::Entities::Sentence)
@@ -16,7 +16,8 @@ module Treat::Lexicalizers
16
16
  module Categorizers
17
17
  extend Treat::Groupable
18
18
  self.type = :annotator
19
- self.targets = [:token]
19
+ self.targets = [:sentence, :phrase, :token]
20
+ self.recursive = true
20
21
  self.default = :from_tag
21
22
  end
22
23
 
@@ -23,10 +23,10 @@ module Treat::Processors::Parsers::Enju
23
23
  @@parser = nil
24
24
 
25
25
  # A hash of Enju cat tags mapped to word categories.
26
- Ectc = Treat::Linguistics::Tags::EnjuCatToCategory
26
+ Ectc = Treat::Universalisation::Tags::EnjuCatToCategory
27
27
 
28
28
  # A hash of Enju cat/xcat pairs mapped to PTB tags.
29
- Ecxtp = Treat::Linguistics::Tags::EnjuCatXcatToPTB
29
+ Ecxtp = Treat::Universalisation::Tags::EnjuCatXcatToPTB
30
30
 
31
31
  # Parse the entity into its syntactical
32
32
  # phrases using Enju.
@@ -27,6 +27,10 @@ class Treat::Processors::Parsers::Stanford
27
27
  lang = entity.language
28
28
  init(lang, options)
29
29
 
30
+ tag_set = Treat::Universalisation::Tags::
31
+ StanfordTagSetForLanguage[
32
+ Treat::Languages.describe(lang)]
33
+
30
34
  text = ::StanfordCoreNLP::Text.new(val)
31
35
  @@parsers[lang].annotate(text)
32
36
 
@@ -37,17 +41,18 @@ class Treat::Processors::Parsers::Stanford
37
41
  tag = s.get(:category).to_s
38
42
  tag_s, tag_opt = *tag.split('-')
39
43
  tag_s ||= 'S'
40
- entity.set :tag_set, :penn
41
44
  entity.set :tag, tag_s
42
45
  entity.set :tag_opt, tag_opt if tag_opt
43
- recurse(s.get(:tree).children[0], entity)
44
- break
46
+ recurse(s.get(:tree).children[0], entity, tag_set)
47
+ break #######
45
48
  else
46
49
  recurse(s.get(:tree), entity)
47
50
  end
48
51
 
49
52
  end
50
53
 
54
+ entity.set :tag_set, tag_set
55
+
51
56
  end
52
57
 
53
58
  def self.init(lang, options)
@@ -76,7 +81,7 @@ class Treat::Processors::Parsers::Stanford
76
81
 
77
82
  # Helper method which recurses the tree supplied by
78
83
  # the Stanford parser.
79
- def self.recurse(java_node, ruby_node, additional_tags = [])
84
+ def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
80
85
 
81
86
  if java_node.num_children == 0
82
87
 
@@ -85,10 +90,8 @@ class Treat::Processors::Parsers::Stanford
85
90
  tag_s, tag_opt = *tag.split('-')
86
91
  tag_s ||= ''
87
92
  ruby_node.value = java_node.value.to_s.strip
88
- ruby_node.set :tag_set, :penn
89
93
  ruby_node.set :tag, tag_s
90
94
  ruby_node.set :tag_opt, tag_opt if tag_opt
91
- ruby_node.set :tag_set, :penn
92
95
  ruby_node.set :lemma, label.get(:lemma).to_s
93
96
 
94
97
  additional_tags.each do |t|
@@ -103,33 +106,35 @@ class Treat::Processors::Parsers::Stanford
103
106
  if java_node.num_children == 1 &&
104
107
  java_node.children[0].num_children == 0
105
108
  recurse(java_node.children[0],
106
- ruby_node, additional_tags)
109
+ ruby_node, tag_set, additional_tags)
107
110
  return
108
111
  end
109
112
 
110
113
  java_node.children.each do |java_child|
114
+
111
115
  label = java_child.label
112
116
  tag = label.get(:category).to_s
113
117
  tag_s, tag_opt = *tag.split('-')
114
118
  tag_s ||= ''
115
-
116
- if Treat::Linguistics::Tags::PhraseTagToCategory[tag_s]
119
+
120
+ if Treat::Universalisation::Tags::PhraseTagToCategory[tag_s] &&
121
+ Treat::Universalisation::Tags::PhraseTagToCategory[tag_s][tag_set]
117
122
  ruby_child = Treat::Entities::Phrase.new
118
123
  else
119
124
  l = java_child.children[0].to_s
120
125
  v = java_child.children[0].value.to_s.strip
126
+
121
127
  # Mhmhmhmhmhm
122
128
  val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
123
129
  ruby_child = Treat::Entities::Token.from_string(val)
124
130
  end
125
131
 
126
- ruby_child.set :tag_set, :penn
127
132
  ruby_child.set :tag, tag_s
128
133
  ruby_child.set :tag_opt, tag_opt if tag_opt
129
134
  ruby_node << ruby_child
130
135
 
131
136
  unless java_child.children.empty?
132
- recurse(java_child, ruby_child, additional_tags)
137
+ recurse(java_child, ruby_child, tag_set, additional_tags)
133
138
  end
134
139
 
135
140
  end
@@ -137,4 +142,5 @@ class Treat::Processors::Parsers::Stanford
137
142
  end
138
143
 
139
144
  end
145
+
140
146
  end
@@ -38,15 +38,18 @@ module Treat::Processors::Segmenters::Punkt
38
38
 
39
39
  s = entity.to_s
40
40
 
41
- # Replace all decimal points by ^^
41
+ # Replace the point in all floating-point numbers
42
+ # by ^^; this is a fix since Punkt trips on decimal
43
+ # numbers.
42
44
  Treat::Helpers::DecimalPointEscaper.escape!(s)
43
- s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
45
+ s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
44
46
 
45
47
  result = @@segmenters[lang].
46
48
  sentences_from_text(s,
47
49
  :output => :sentences_text)
48
50
 
49
51
  result.each do |sentence|
52
+ # Unescape the sentence.
50
53
  Treat::Helpers::DecimalPointEscaper.
51
54
  unescape!(sentence)
52
55
  entity << Treat::Entities::Phrase.
@@ -29,17 +29,21 @@ module Treat::Processors::Segmenters::Tactful
29
29
  entity.check_hasnt_children
30
30
 
31
31
  s = entity.to_s
32
+
32
33
  Treat::Helpers::DecimalPointEscaper.escape!(s)
33
34
 
34
- s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
35
+ s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
35
36
 
36
37
  @@segmenter ||= TactfulTokenizer::Model.new
37
38
 
38
39
  sentences = @@segmenter.tokenize_text(s)
40
+
39
41
  sentences.each do |sentence|
40
42
  Treat::Helpers::DecimalPointEscaper.unescape!(sentence)
43
+ puts sentence.to_s if sentence.to_s.include?('staff')
41
44
  entity << Treat::Entities::Phrase.from_string(sentence)
42
45
  end
46
+
43
47
  end
44
48
 
45
49
  end
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  # A native rule-basd tokenizer based on the one
2
3
  # developped by Robert Macyntyre in 1995 for the Penn
3
4
  # Treebank project. This tokenizer follows the
@@ -11,8 +12,6 @@
11
12
  # you can redistribute it and/or modify it under the
12
13
  # same terms as Ruby itself.
13
14
  module Treat::Processors::Tokenizers::PTB
14
-
15
- require 'treat/helpers/decimal_point_escaper'
16
15
 
17
16
  # Tokenize the entity using a native rule-based algorithm.
18
17
  def self.tokenize(entity, options = {})
@@ -33,8 +32,17 @@ module Treat::Processors::Tokenizers::PTB
33
32
 
34
33
  # Helper method to split the string into tokens.
35
34
  def self.split(string)
35
+
36
36
  s = " " + string + " "
37
- Treat::Helpers::DecimalPointEscaper.escape!(s)
37
+
38
+ # Translate some common extended ascii
39
+ # characters to quotes
40
+ s.gsub!(/‘/,'`')
41
+ s.gsub!(/’/,"'")
42
+ s.gsub!(/“/,"``")
43
+ s.gsub!(/”/,"''")
44
+
45
+
38
46
  s.gsub!(/\s+/," ")
39
47
  s.gsub!(/(\s+)''/,'\1"')
40
48
  s.gsub!(/(\s+)``/,'\1"')
@@ -14,8 +14,6 @@
14
14
  # Project website: https://github.com/lfcipriani/punkt-segmenter
15
15
  class Treat::Processors::Tokenizers::Punkt
16
16
 
17
- require 'treat/helpers/decimal_point_escaper'
18
-
19
17
  SentEndChars = ['.', '?', '!']
20
18
  ReSentEndChars = /[.?!]/
21
19
  InternalPunctuation = [',', ':', ';']
@@ -35,7 +33,6 @@ class Treat::Processors::Tokenizers::Punkt
35
33
  entity.check_hasnt_children
36
34
 
37
35
  s = entity.to_s
38
- Treat::Helpers::DecimalPointEscaper.escape!(s)
39
36
 
40
37
  s.scan(ReWordTokenizer).each do |token|
41
38
  if SentEndChars.include?(token[-1])
@@ -51,6 +51,7 @@ class Treat::Processors::Tokenizers::Tactful
51
51
  entity.check_hasnt_children
52
52
 
53
53
  s = entity.to_s
54
+
54
55
  Treat::Helpers::DecimalPointEscaper.escape!(s)
55
56
 
56
57
  ReTokenize.each do |rules|
@@ -58,6 +59,8 @@ class Treat::Processors::Tokenizers::Tactful
58
59
  end
59
60
 
60
61
  s.split(' ').each do |token|
62
+
63
+ Treat::Helpers::DecimalPointEscaper.unescape!(token)
61
64
  entity << Treat::Entities::Token.
62
65
  from_string(token)
63
66
  end
@@ -0,0 +1,12 @@
1
+ module Treat::Universalisation
2
+
3
+ Encodings = {
4
+ :arabic => 'UTF-8',
5
+ :chinese => 'GB18030',
6
+ :english => 'UTF-8',
7
+ :french => 'ISO_8859-1',
8
+ :german => 'ISO_8859-1',
9
+ :hebrew => 'UTF-8'
10
+ }
11
+
12
+ end
@@ -1,14 +1,20 @@
1
- module Treat::Linguistics::Tags
1
+ module Treat::Universalisation::Tags
2
2
 
3
3
  ClawsC5 = 0
4
4
  Brown = 1
5
5
  Penn = 2
6
- Negra = 3
6
+ Stuttgart = 3
7
7
  PennChinese = 4
8
- Simple = 5
8
+ Paris7 = 5
9
+
10
+ StanfordTagSetForLanguage = {
11
+ :french => :paris7,
12
+ :english => :penn,
13
+ :german => :stuttgart
14
+ }
9
15
 
10
16
  PTBClauseTagDescription = [
11
- ['S', 'Simple declarative clause'],
17
+ ['S', 'Paris7 declarative clause'],
12
18
  ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
13
19
  ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
14
20
  ['SINV', 'Inverted declarative sentence'],
@@ -26,30 +32,33 @@ module Treat::Linguistics::Tags
26
32
 
27
33
  AlignedPhraseTags =
28
34
  [
29
- 'Adjective phrase', ['', '', 'ADJP'],
30
- 'Adverb phrase', ['', '', 'ADVP'],
31
- 'Conjunction phrase', ['', '', 'CONJP'],
32
- 'Fragment', ['', '', 'FRAG'],
33
- 'Interjection', ['', '', 'INTJ'],
34
- 'List marker', ['', '', 'LST'],
35
- 'Not a phrase', ['', '', 'NAC'],
36
- 'Noun phrase', ['', '', 'NP'],
37
- 'Head of NP', ['', '', 'NX'],
38
- 'Prepositional phrase', ['', '', 'PP'],
39
- 'Parenthetical', ['', '', 'PRN'],
40
- 'Particle', ['', '', 'PRT'],
41
- 'Quantifier phrase', ['', '', 'QP'],
42
- 'Reduced relative clause', ['', '', 'RRC'],
43
- 'Unlike coordinated phrase', ['', '', 'UCP'],
44
- 'Verb phrase', ['', '', 'VP'],
45
- 'Wh adjective phrase', ['', '', 'WHADJP'],
46
- 'Wh adverb phrase', ['', '', 'WHAVP'],
47
- 'Wh noun phrase', ['', '', 'WHNP'],
48
- 'Wh prepositional phrase', ['', '', 'WHPP'],
49
- 'Unknown', ['', '', 'X'],
50
- 'Phrase', ['', '', 'P'],
51
- 'Sentence', ['', '', 'S'],
52
- 'Phrase', ['', '', 'SBAR'] # Fix
35
+ 'Adjectival phrase', ['', '', 'ADJP', '', '', 'AP'],
36
+ 'Adverbial phrase', ['', '', 'ADVP', '', '', 'AdP'],
37
+ 'Conjunction phrase', ['', '', 'CONJP', '', '', 'Ssub'],
38
+ 'Fragment', ['', '', 'FRAG', '', '', ''],
39
+ 'Interjectional phrase', ['', '', 'INTJ', '', '', ''],
40
+ 'List marker', ['', '', 'LST', '', '', ''],
41
+ 'Not a phrase', ['', '', 'NAC', '', '', ''],
42
+ 'Noun phrase', ['', '', 'NP', '', '', 'NP'],
43
+ 'Verbal nucleus', ['', '', '', '', '', 'VN'],
44
+ 'Head of noun phrase', ['', '', 'NX', '', '', ''],
45
+ 'Prepositional phrase', ['', '', 'PP', '', '', 'PP'],
46
+ 'Parenthetical', ['', '', 'PRN', '', '', ''],
47
+ 'Particle', ['', '', 'PRT', '', '', ''],
48
+ 'Participial phrase', ['', '', '', '', '', 'VPart'],
49
+ 'Quantifier phrase', ['', '', 'QP', '', '', ''],
50
+ 'Relative clause', ['', '', 'RRC', '', '', 'Srel'],
51
+ 'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
52
+ 'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
53
+ 'Verb phrase', ['', '', 'VP', '', '', ''],
54
+ 'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
55
+ 'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
56
+ 'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
57
+ 'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
58
+ 'Unknown', ['', '', 'X', '', '', ''],
59
+ 'Phrase', ['', '', 'P', '', '', 'Sint'],
60
+ 'Sentence', ['', '', 'S', '', '', 'SENT'],
61
+ 'Phrase', ['', '', 'SBAR', '', '', ''] # Fix
53
62
  ]
54
63
 
55
64
  # A description of Enju categories.
@@ -139,12 +148,12 @@ module Treat::Linguistics::Tags
139
148
  # JRS?
140
149
 
141
150
 
142
- SimpleWordTagToCategory = {
151
+ Paris7WordTagToCategory = {
143
152
  'C' => :complementizer,
144
153
  'PN' => :punctuation,
145
154
  'SC' => :conjunction
146
155
  }
147
-
156
+
148
157
  PunctuationToCategory = {
149
158
  '.' => :period,
150
159
  ',' => :comma,
@@ -152,9 +161,8 @@ module Treat::Linguistics::Tags
152
161
  ':' => :colon,
153
162
  '!' => :exclamation,
154
163
  '?' => :interrogation,
155
- '"' => :quote,
156
- "'" => :quote,
157
-
164
+ '"' => :double_quote,
165
+ "'" => :single_quote,
158
166
  '$' => :dollar,
159
167
  '%' => :percent,
160
168
  '#' => :hash,
@@ -227,7 +235,7 @@ module Treat::Linguistics::Tags
227
235
  'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
228
236
  'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
229
237
  'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
230
-
238
+ 'Interjection', ['', '', '', '', '', 'I'],
231
239
  'Localizer', ['', '', '', '', 'LC'],
232
240
 
233
241
  'Measure word', ['', '', '', '', 'M'],
@@ -366,11 +374,25 @@ module Treat::Linguistics::Tags
366
374
  'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
367
375
  'Verb, ????', ['', '', '', '', 'VC'] # ?
368
376
  ]
369
-
370
- wttc = {
371
-
372
- }
373
- Treat::Linguistics::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
377
+
378
+ # Paris7 Treebank functional tags
379
+ =begin
380
+ SUJ (subject)
381
+ OBJ (direct object)
382
+ ATS (predicative complement of a subject)
383
+ ATO (predicative complement of a direct object)
384
+ MOD (modifier or adjunct)
385
+ A-OBJ (indirect complement introduced by à)
386
+ DE-OBJ (indirect complement introduced by de)
387
+ P-OBJ (indirect complement introduced by another preposition)
388
+ =end
389
+
390
+ # !! Extremely ugly code follows.
391
+
392
+ # Generate word tag -> category hash.
393
+ wttc = {}
394
+
395
+ Treat::Universalisation::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
374
396
 
375
397
  category = desc.gsub(',', ' ,').
376
398
  split(' ')[0].downcase.intern
@@ -378,32 +400,41 @@ module Treat::Linguistics::Tags
378
400
  wttc[tags[ClawsC5]] ||= {}
379
401
  wttc[tags[Brown]] ||= {}
380
402
  wttc[tags[Penn]] ||= {}
381
- wttc[tags[Negra]] ||= {}
403
+ wttc[tags[Stuttgart]] ||= {}
382
404
  wttc[tags[PennChinese]] ||= {}
383
- wttc[tags[Simple]] ||= {}
405
+ wttc[tags[Paris7]] ||= {}
384
406
 
385
407
  wttc[tags[ClawsC5]][:claws_5] = category
386
408
  wttc[tags[Brown]][:brown] = category
387
409
  wttc[tags[Penn]][:penn] = category
388
- wttc[tags[Negra]][:negra] = category if tags[Negra]
410
+ wttc[tags[Stuttgart]][:stuttgart] = category if tags[Stuttgart]
389
411
  wttc[tags[PennChinese]][:penn_chinese] = category if tags[PennChinese]
390
- wttc[tags[Simple]][:simple] = category if tags[Simple]
412
+ wttc[tags[Paris7]][:paris7] = category if tags[Paris7]
391
413
 
392
414
  end
415
+
393
416
  # A hash converting word tags to word categories.
394
417
  WordTagToCategory = wttc
395
418
 
396
419
  # A hash converting phrase tag to categories.
397
420
  pttc = {}
398
- Treat::Linguistics::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
421
+
422
+ Treat::Universalisation::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
423
+
399
424
  category = desc.gsub(',', ' ,').gsub(' ', '_').downcase.intern
425
+
400
426
  pttc[tags[Penn]] ||= {};
427
+ pttc[tags[Paris7]] ||= {};
428
+
429
+ pttc[tags[Penn]][:penn] = category
430
+ pttc[tags[Paris7]][:paris7] = category
431
+
401
432
  # Not yet for other tag sts.
402
433
  #pttc[tags[0]][:claws_5] = category
403
434
  #pttc[tags[1]][:brown] = category
404
- pttc[tags[Penn]][:penn] = category
435
+
405
436
  end
406
-
437
+
407
438
  # A hash converting word tags to word categories.
408
439
  PhraseTagToCategory = pttc
409
440
 
@@ -0,0 +1,9 @@
1
+ module Treat::Universalisation
2
+
3
+ p = 'treat/universalisation/*.rb'
4
+
5
+ Dir[Treat.lib + p].each do |f|
6
+ require f
7
+ end
8
+
9
+ end
data/lib/treat.rb CHANGED
@@ -10,7 +10,7 @@ module Treat
10
10
  end
11
11
 
12
12
  # The current version of Treat.
13
- VERSION = "1.0.4"
13
+ VERSION = "1.0.5"
14
14
 
15
15
  # Add methods to handle syntactic sugar,
16
16
  # language configuration options, and paths.
@@ -44,7 +44,7 @@ module Treat
44
44
  require 'treat/kernel'
45
45
  require 'treat/downloader'
46
46
  require 'treat/languages'
47
- require 'treat/linguistics'
47
+ require 'treat/universalisation'
48
48
  require 'treat/entities'
49
49
  require 'treat/categories'
50
50
  require 'treat/data_set'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: treat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-20 00:00:00.000000000 Z
12
+ date: 2012-05-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rubyzip
@@ -161,6 +161,7 @@ files:
161
161
  - lib/treat/languages/german.rb
162
162
  - lib/treat/languages/greek.rb
163
163
  - lib/treat/languages/italian.rb
164
+ - lib/treat/languages/language.rb
164
165
  - lib/treat/languages/list.txt
165
166
  - lib/treat/languages/polish.rb
166
167
  - lib/treat/languages/portuguese.rb
@@ -176,9 +177,6 @@ files:
176
177
  - lib/treat/lexicalizers/taggers/lingua.rb
177
178
  - lib/treat/lexicalizers/taggers/stanford.rb
178
179
  - lib/treat/lexicalizers.rb
179
- - lib/treat/linguistics/categories.rb
180
- - lib/treat/linguistics/tags.rb
181
- - lib/treat/linguistics.rb
182
180
  - lib/treat/loaders/linguistics.rb
183
181
  - lib/treat/loaders/stanford.rb
184
182
  - lib/treat/object.rb
@@ -190,7 +188,6 @@ files:
190
188
  - lib/treat/processors/segmenters/punkt.rb
191
189
  - lib/treat/processors/segmenters/stanford.rb
192
190
  - lib/treat/processors/segmenters/tactful.rb
193
- - lib/treat/processors/tokenizers/perl.rb
194
191
  - lib/treat/processors/tokenizers/ptb.rb
195
192
  - lib/treat/processors/tokenizers/punkt.rb
196
193
  - lib/treat/processors/tokenizers/stanford.rb
@@ -202,6 +199,9 @@ files:
202
199
  - lib/treat/retrievers.rb
203
200
  - lib/treat/server.rb
204
201
  - lib/treat/tree.rb
202
+ - lib/treat/universalisation/encodings.rb
203
+ - lib/treat/universalisation/tags.rb
204
+ - lib/treat/universalisation.rb
205
205
  - lib/treat.rb
206
206
  - spec/collection.rb
207
207
  - spec/document.rb
@@ -1,9 +0,0 @@
1
- module Treat::Linguistics
2
-
3
- p = 'treat/linguistics/*.rb'
4
-
5
- Dir[Treat.lib + p].each do |f|
6
- require f
7
- end
8
-
9
- end
@@ -1,132 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # Tokenize the entity using a native rule-based
4
- # algorithm. This tokenizer is a port from an
5
- # unknown Perl module, which I have lifted from
6
- # the 'rbtagger' gem.
7
- #
8
- # Author: Todd A. Fisher
9
- #
10
- # This code is free to use under the terms of
11
- # the MIT license.
12
- #
13
- # Original project website:
14
- #
15
- # https://github.com/taf2/rb-brill-tagger
16
- module Treat::Processors::Tokenizers::Perl
17
-
18
- require 'treat/helpers/decimal_point_escaper'
19
-
20
- # Tokenize the entity using a rule-based algorithm
21
- # ported from Perl by Todd A. Fisher.
22
- #
23
- # Options: none.
24
- def self.tokenize(entity, options = {})
25
-
26
- entity.check_hasnt_children
27
- s = entity.to_s
28
-
29
- tokens = get_tokens(entity.to_s)
30
- tokens[1..-1].each do |token|
31
- next if token =~ /^\s*$/
32
- entity << Treat::Entities::Token.
33
- from_string(token)
34
- end
35
-
36
- end
37
-
38
- # Helper method to perform the tokenization.
39
- def self.get_tokens(string)
40
-
41
- # Normalize all whitespace
42
- text = string.gsub(/\s+/,' ')
43
-
44
- # Replace all decimal points by ^^
45
- Treat::Helpers::DecimalPointEscaper.escape!(text)
46
-
47
- =begin
48
-
49
- # Translate some common extended ascii
50
- # characters to quotes
51
- text.gsub!(/‘/,'`')
52
- text.gsub!(/’/,"'")
53
- text.gsub!(/“/,"``")
54
- text.gsub!(/”/,"''")
55
-
56
- # Attempt to get correct directional quotes
57
- # s{\"\b} { `` }g;
58
- text.gsub!(/\"\b/,' `` ')
59
- # s{\b\"} { '' }g;
60
- text.gsub!(/\b\"/," '' ")
61
- #s{\"(?=\s)} { '' }g;
62
- text.gsub!(/\"(?=\s)/," '' ")
63
- #s{\"} { `` }g;
64
- text.gsub!(/\"(?=\s)/," `` ")
65
- =end
66
-
67
- # Isolate ellipses
68
- # s{\.\.\.} { ... }g;
69
- text.gsub!(/\.\.\./,' ... ')
70
- # Isolate any embedded punctuation chars
71
- # s{([,;:\@\#\$\%&])} { $1 }g;
72
- text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
73
-
74
- # Assume sentence tokenization has been
75
- # done first, so split FINAL
76
- # periods only.
77
- # s/ ([^.]) \. ([\]\)\}\>\"\']*)
78
- # [ \t]* $ /$1 .$2 /gx;
79
- text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
80
- # however, we may as well split ALL
81
- # question marks and exclamation points,
82
- # since they shouldn't have the abbrev.
83
- # -marker ambiguity problem
84
- #s{([?!])} { $1 }g;
85
- text.gsub!(/([?!])/, ' \1 ')
86
- # parentheses, brackets, etc.
87
- #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
88
- text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
89
- #s/(-{2,})/ $1 /g;
90
- text.gsub!(/(-{2,})/,' \1 ')
91
-
92
- # Add a space to the beginning and end of
93
- # each line, to reduce # of regexps below.
94
- #s/$/ /;
95
- text.gsub!(/$/," ")
96
- #s/^/ /;
97
- text.gsub!(/^/," ")
98
-
99
- # possessive or close-single-quote
100
- #s/\([^\']\)\' /$1 \' /g;
101
- text.gsub!(/\([^\']\)\' /,%q(\1 ' ))
102
- # as in it's, I'm, we'd
103
- #s/\'([smd]) / \'$1 /ig;
104
- text.gsub!(/\'([smd]) /i,%q( '\1 ))
105
- #s/\'(ll|re|ve) / \'$1 /ig;
106
- text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
107
- #s/n\'t / n\'t /ig;
108
- text.gsub!(/n\'t /i," n't ")
109
-
110
- #s/ (can)(not) / $1 $2 /ig;
111
- text.gsub!(/ (can)(not) /i,' \1 \2 ')
112
- #s/ (d\')(ye) / $1 $2 /ig;
113
- text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
114
- #s/ (gim)(me) / $1 $2 /ig;
115
- text.gsub!(/ (gim)(me) /i,' \1 \2 ')
116
- #s/ (gon)(na) / $1 $2 /ig;
117
- text.gsub!(/ (gon)(na) /i,' \1 \2 ')
118
- #s/ (got)(ta) / $1 $2 /ig;
119
- text.gsub!(/ (got)(ta) /i,' \1 \2 ')
120
- #s/ (lem)(me) / $1 $2 /ig;
121
- text.gsub!(/ (lem)(me) /i,' \1 \2 ')
122
- #s/ (more)(\'n) / $1 $2 /ig;
123
- text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
124
- #s/ (\'t)(is|was) / $1 $2 /ig;
125
- text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
126
- #s/ (wan)(na) / $1 $2 /ig;
127
- text.gsub!(/ (wan)(na) /i,' \1 \2 ')
128
- text.split(/\s/)
129
-
130
- end
131
-
132
- end