treat 1.0.4 → 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/LICENSE +0 -1
  2. data/files/INFO +1 -1
  3. data/lib/treat/entities/abilities/buildable.rb +2 -6
  4. data/lib/treat/entities/abilities/checkable.rb +2 -2
  5. data/lib/treat/entities/abilities/delegatable.rb +2 -2
  6. data/lib/treat/entities/abilities/doable.rb +6 -1
  7. data/lib/treat/entities/abilities/iterable.rb +8 -0
  8. data/lib/treat/entities/abilities/magical.rb +1 -1
  9. data/lib/treat/extractors.rb +1 -1
  10. data/lib/treat/formatters/visualizers/standoff.rb +1 -1
  11. data/lib/treat/groupable.rb +4 -0
  12. data/lib/treat/installer.rb +33 -13
  13. data/lib/treat/kernel.rb +0 -4
  14. data/lib/treat/languages/arabic.rb +1 -1
  15. data/lib/treat/languages/chinese.rb +1 -1
  16. data/lib/treat/languages/dutch.rb +1 -1
  17. data/lib/treat/languages/english.rb +1 -1
  18. data/lib/treat/languages/french.rb +4 -4
  19. data/lib/treat/languages/german.rb +3 -3
  20. data/lib/treat/languages/italian.rb +1 -1
  21. data/lib/treat/{linguistics/categories.rb → languages/language.rb} +3 -4
  22. data/lib/treat/languages/polish.rb +1 -1
  23. data/lib/treat/languages/portuguese.rb +1 -1
  24. data/lib/treat/languages/russian.rb +1 -1
  25. data/lib/treat/languages/spanish.rb +1 -1
  26. data/lib/treat/languages/swedish.rb +1 -1
  27. data/lib/treat/lexicalizers/categorizers/from_tag.rb +14 -10
  28. data/lib/treat/lexicalizers/taggers/brill.rb +1 -1
  29. data/lib/treat/lexicalizers/taggers/stanford.rb +5 -2
  30. data/lib/treat/lexicalizers.rb +2 -1
  31. data/lib/treat/processors/parsers/enju.rb +2 -2
  32. data/lib/treat/processors/parsers/stanford.rb +17 -11
  33. data/lib/treat/processors/segmenters/punkt.rb +5 -2
  34. data/lib/treat/processors/segmenters/tactful.rb +5 -1
  35. data/lib/treat/processors/tokenizers/ptb.rb +11 -3
  36. data/lib/treat/processors/tokenizers/punkt.rb +0 -3
  37. data/lib/treat/processors/tokenizers/tactful.rb +3 -0
  38. data/lib/treat/universalisation/encodings.rb +12 -0
  39. data/lib/treat/{linguistics → universalisation}/tags.rb +77 -46
  40. data/lib/treat/universalisation.rb +9 -0
  41. data/lib/treat.rb +2 -2
  42. metadata +6 -6
  43. data/lib/treat/linguistics.rb +0 -9
  44. data/lib/treat/processors/tokenizers/perl.rb +0 -132
data/LICENSE CHANGED
@@ -20,7 +20,6 @@ Non-trivial amount of code has been incorporated and modified from other librari
20
20
  - formatters/readers/odt.rb - Mark Watson (GPL license)
21
21
  - processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
22
22
  - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
23
- - processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
24
23
  - processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license)
25
24
  - extractors/topics/reuters.rb - Mark Watson (GPL license)
26
25
  - inflectors/declensions/english.rb - Thomas Sawyer (MIT license)
data/files/INFO CHANGED
@@ -1 +1 @@
1
- This is a folder containing the files downloaded by Treat.
1
+ This is a folder containing the files downloaded by Treat from the internet.
@@ -4,12 +4,11 @@
4
4
  # is pretty much self-explanatory.
5
5
  module Treat::Entities::Abilities::Buildable
6
6
 
7
- require 'treat/helpers/decimal_point_escaper'
8
7
  require 'fileutils'
9
8
 
10
9
  # Simple regexps to match common entities.
11
10
  WordRegexp = /^[[:alpha:]\-']+$/
12
- NumberRegexp = /^#?([0-9]+)(\^\^[0-9]+)?$/
11
+ NumberRegexp = /^#?([0-9]+)(\.[0-9]+)?$/
13
12
  PunctRegexp = /^[[:punct:]\$]+$/
14
13
  UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
15
14
  EmailRegexp = /.+\@.+\..+/
@@ -57,8 +56,6 @@ module Treat::Entities::Abilities::Buildable
57
56
  # instead of from_string directly).
58
57
  def from_string(string, enforce_type = false)
59
58
 
60
- Treat::Helpers::DecimalPointEscaper.escape!(string)
61
-
62
59
  enforce_type = true if caller_method == :build
63
60
 
64
61
  unless self == Treat::Entities::Entity
@@ -74,6 +71,7 @@ module Treat::Entities::Abilities::Buildable
74
71
  end
75
72
 
76
73
  e
74
+
77
75
  end
78
76
 
79
77
  # Build a document from a URL.
@@ -116,7 +114,6 @@ module Treat::Entities::Abilities::Buildable
116
114
  "a numeric object."
117
115
  end
118
116
  n = numeric.to_s
119
- Treat::Helpers::DecimalPointEscaper.unescape!(n)
120
117
  Treat::Entities::Number.new(n)
121
118
  end
122
119
 
@@ -319,7 +316,6 @@ module Treat::Entities::Abilities::Buildable
319
316
  end
320
317
 
321
318
  def create_collection(fv)
322
- debug("Creating new collection in directory #{fv}.")
323
319
  FileUtils.mkdir(fv)
324
320
  Treat::Entities::Collection.new(fv)
325
321
  end
@@ -24,8 +24,8 @@ module Treat::Entities::Abilities::Checkable
24
24
  return unless has_children?
25
25
  raise Treat::Exception,
26
26
  "Warning: can't #{caller_method(2)} "+
27
- "an entity that has children. Removing " +
28
- " all children of text \"[#{short_value}].\""
27
+ "the text \"#{short_value}\", because it " +
28
+ "already has children."
29
29
  end
30
30
 
31
31
  end
@@ -104,9 +104,9 @@ module Treat::Entities::Abilities::Delegatable
104
104
  if !klass[g] || !klass[g][0]
105
105
  d = ucc(cl(group))
106
106
  d.gsub!('_', ' ')
107
- d = 'worker to find "' + d
107
+ d = d[0..-2]
108
108
  raise Treat::Exception, "No #{d}" +
109
- "\" is available for the " +
109
+ " is available for the " +
110
110
  "#{lang.to_s.capitalize} language."
111
111
  end
112
112
  return klass[g][0]
@@ -37,8 +37,13 @@ module Treat::Entities::Abilities::Doable
37
37
  end
38
38
  if f || entity_types.include?(:entity)
39
39
  send(task, worker, options)
40
+ if group.recursive
41
+ each do |entity|
42
+ entity.do_task(task, worker, options, group)
43
+ end
44
+ end
40
45
  else
41
- each_entity(*entity_types) do |entity|
46
+ each do |entity|
42
47
  entity.do_task(task, worker, options, group)
43
48
  end
44
49
  unless entity_types.include?(type)
@@ -95,6 +95,14 @@ module Treat::Entities::Abilities::Iterable
95
95
  as
96
96
  end
97
97
 
98
+ # Returns the first ancestor that has a feature
99
+ # with the given name, otherwise nil.
100
+ def ancestor_with_feature(type, feature)
101
+ each_ancestor(type) do |ancestor|
102
+ return ancestor if ancestor.has?(feature)
103
+ end
104
+ end
105
+
98
106
  alias :ancestors_with_type :ancestors_with_types
99
107
 
100
108
  # Number of children that have a given feature.
@@ -25,7 +25,7 @@ module Treat::Entities::Abilities::Magical
25
25
  def magic(sym, *args)
26
26
 
27
27
  @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
28
- @@cats_regexp ||= "(#{Treat::Linguistics::WordCategories.join('|')})"
28
+ @@cats_regexp ||= "(#{Treat::Languages::Language::WordCategories.join('|')})"
29
29
 
30
30
  method = sym.to_s =~ /entities/ ?
31
31
  sym.to_s.gsub('entities', 'entitys') :
@@ -27,7 +27,7 @@ module Treat::Extractors
27
27
  module Keywords
28
28
  extend Treat::Groupable
29
29
  self.type = :annotator
30
- self.targets = [:document]
30
+ self.targets = [:document, :section, :zone]
31
31
  end
32
32
 
33
33
  # Extract clusters of topic words from a collection.
@@ -44,7 +44,7 @@ class Treat::Formatters::Visualizers::Standoff
44
44
  end
45
45
 
46
46
  def self.ptb_escape(val)
47
- Treat::Linguistics::Tags::
47
+ Treat::Universalisation::Tags::
48
48
  PTBEscapeCharacters.each do |char, esc|
49
49
  val.gsub!(char, val)
50
50
  end
@@ -95,8 +95,12 @@ module Treat::Groupable
95
95
  attr_accessor :presets
96
96
  # The preset option to use with preset functions.
97
97
  attr_accessor :preset_option
98
+ # Whether to recurse within multiple targets or not.
99
+ attr_accessor :recursive
98
100
  end
99
101
 
102
+ self.recursive = false
103
+
100
104
  # Return the method corresponding to the group.
101
105
  # This method resolves the name of the method
102
106
  # that a group should provide based on the name
@@ -82,9 +82,13 @@ module Treat::Installer
82
82
  begin
83
83
  Gem::Specification.find_by_name('punkt-segmenter')
84
84
  title "Downloading model for the Punkt segmenter for the #{l}."
85
- download_punkt_models(language)
85
+ # Need fix
86
+ download_punkt_models([language.to_s])
86
87
  rescue Gem::LoadError; end
87
-
88
+
89
+ # Download reuters models always
90
+ download_reuters_models
91
+
88
92
  # If stanford is installed, download models.
89
93
  begin
90
94
  Gem::Specification.find_by_name('stanford-core-nlp')
@@ -92,7 +96,10 @@ module Treat::Installer
92
96
  "model files for the the #{l}.\n\n"
93
97
  package = (language == :english) ? :english : :all
94
98
  download_stanford(package)
95
- rescue Gem::LoadError; end
99
+ rescue Gem::LoadError
100
+ puts 'Stanford-core-nlp gem not installed.'
101
+ puts 'Skipping download of Stanford models.'
102
+ end
96
103
 
97
104
  title "Install external binary libraries " +
98
105
  "(requires port, apt-get or win-get).\n"
@@ -124,7 +131,7 @@ module Treat::Installer
124
131
  install_dependencies(false)
125
132
  install_language_dependencies(dep, false)
126
133
  download_stanford(:minimal)
127
- download_punkt_models(:english)
134
+ download_punkt_models([:english])
128
135
  end
129
136
 
130
137
  def self.install_dependencies(optionally)
@@ -166,7 +173,7 @@ module Treat::Installer
166
173
  unless man
167
174
  puts "Skipping installation of the "+
168
175
  "following binaries:\n\n"
169
- Binaries.each do |binary, purpose|
176
+ Binary.each do |binary, purpose|
170
177
  puts "- #{binary} to #{purpose}"
171
178
  end
172
179
  return
@@ -227,22 +234,35 @@ module Treat::Installer
227
234
 
228
235
  end
229
236
 
230
- def self.download_punkt_models(language)
237
+ def self.download_punkt_models(languages)
238
+ languages.map! { |l| "#{l}.yaml" }
239
+ download_models 'punkt', languages
240
+ end
241
+
242
+ def self.download_reuters_models
243
+ files = ["industry.xml", "region.xml", "topics.xml"]
244
+ download_models 'reuters', files
245
+ end
231
246
 
232
- f = "#{language}.yaml"
233
- dest = "#{Treat.models}punkt/"
247
+ def self.download_models(directory, files)
234
248
 
235
- loc = Treat::Downloader.download(
236
- 'http', Server, 'treat/punkt', f, Treat.tmp)
249
+ dest = "#{Treat.models}#{directory}/"
237
250
 
238
251
  unless File.readable?(dest)
239
- puts "- Creating directory models/punkt ..."
252
+ puts "- Creating directory models/#{directory} ..."
240
253
  FileUtils.mkdir_p(File.absolute_path(dest))
241
254
  end
242
255
 
243
- puts "- Copying model file to models/punkt ..."
244
- FileUtils.cp(loc, File.join(Paths[:models], 'punkt', f))
245
256
 
257
+ files.each do |file|
258
+ puts "- Downloading #{file} ..."
259
+ loc = Treat::Downloader.download(
260
+ 'http', Server, "treat/#{directory}", file, Treat.tmp)
261
+ puts "- Copying file to models/#{directory} ..."
262
+ FileUtils.cp(loc, File.join(Paths[:models], directory, file))
263
+ end
264
+
265
+
246
266
  puts "- Cleaning up..."
247
267
  FileUtils.rm_rf(Paths[:tmp] + Server)
248
268
 
data/lib/treat/kernel.rb CHANGED
@@ -181,10 +181,6 @@ module Kernel
181
181
  NULL_DEVICE = '/dev/null'
182
182
  end
183
183
 
184
- def debug(msg)
185
- puts msg if Treat.debug
186
- end
187
-
188
184
  def prompt(msg, valid_answers)
189
185
 
190
186
  msg = msg
@@ -6,7 +6,7 @@ class Treat::Languages::Arabic
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford]
9
+ :taggers => [:stanford]
10
10
  }
11
11
  Processors = {
12
12
  :parsers => [:stanford]
@@ -6,7 +6,7 @@ class Treat::Languages::Chinese
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford]
9
+ :taggers => [:stanford]
10
10
  }
11
11
  Processors = {
12
12
  :parsers => [:stanford]
@@ -9,7 +9,7 @@ class Treat::Languages::Dutch
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -31,7 +31,7 @@ class Treat::Languages::English
31
31
  :chunkers => [:txt],
32
32
  :parsers => [:stanford, :enju],
33
33
  :segmenters => [:tactful, :punkt, :stanford],
34
- :tokenizers => [:perl, :ptb, :stanford, :tactful, :punkt]
34
+ :tokenizers => [:ptb, :stanford, :tactful, :punkt]
35
35
  }
36
36
 
37
37
  Retrievers = {
@@ -6,14 +6,14 @@ class Treat::Languages::French
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford],
10
- :category => [:from_tag]
9
+ :taggers => [:stanford],
10
+ :categorizers => [:from_tag]
11
11
  }
12
12
  Processors = {
13
13
  :chunkers => [:txt],
14
14
  :parsers => [:stanford],
15
- :segmenters => [:punkt],
16
- :tokenizers => [:perl, :tactful]
15
+ :segmenters => [:tactful],
16
+ :tokenizers => [:tactful]
17
17
  }
18
18
  Retrievers = {}
19
19
 
@@ -6,14 +6,14 @@ class Treat::Languages::German
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford],
10
- :category => [:from_tag]
9
+ :taggers => [:stanford],
10
+ :categorizers => [:from_tag]
11
11
  }
12
12
  Processors = {
13
13
  :chunkers => [:txt],
14
14
  :parsers => [:stanford],
15
15
  :segmenters => [:punkt],
16
- :tokenizers => [:perl, :tactful]
16
+ :tokenizers => [:tactful]
17
17
  }
18
18
  Retrievers = {}
19
19
 
@@ -10,7 +10,7 @@ class Treat::Languages::Italian
10
10
  :chunkers => [:txt],
11
11
  :parsers => [:stanford],
12
12
  :segmenters => [:punkt],
13
- :tokenizers => [:perl, :tactful]
13
+ :tokenizers => [:tactful]
14
14
  }
15
15
  Retrievers = {}
16
16
 
@@ -1,11 +1,10 @@
1
- module Treat::Linguistics
2
-
3
- # A list of all possible word categories.
1
+ class Treat::Languages::Language
2
+
4
3
  WordCategories = [
5
4
  :adjective, :adverb, :noun, :verb, :interjection,
6
5
  :clitic, :coverb, :conjunction, :determiner, :particle,
7
6
  :preposition, :pronoun, :number, :symbol, :punctuation,
8
7
  :complementizer
9
8
  ]
10
-
9
+
11
10
  end
@@ -9,7 +9,7 @@ class Treat::Languages::Polish
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Portuguese
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Russian
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Spanish
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Swedish
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -3,17 +3,19 @@
3
3
  # from its tag (e.g. 'S', 'NP', 'VBZ', 'ADV', etc.).
4
4
  class Treat::Lexicalizers::Categorizers::FromTag
5
5
 
6
- Pttc = Treat::Linguistics::Tags::PhraseTagToCategory
7
- Wttc = Treat::Linguistics::Tags::WordTagToCategory
8
- Ptc = Treat::Linguistics::Tags::PunctuationToCategory
6
+ Pttc = Treat::Universalisation::Tags::PhraseTagToCategory
7
+ Wttc = Treat::Universalisation::Tags::WordTagToCategory
8
+ Ptc = Treat::Universalisation::Tags::PunctuationToCategory
9
9
 
10
10
  # Find the category of the entity from its tag.
11
11
  def self.category(entity, options = {})
12
12
 
13
13
  tag = entity.check_has(:tag)
14
+
14
15
  return :unknown if tag.nil? || tag == '' || entity.type == :symbol
15
16
  return :sentence if tag == 'S' || entity.type == :sentence
16
17
  return :number if entity.type == :number
18
+
17
19
  return Ptc[entity.to_s] if entity.type == :punctuation
18
20
 
19
21
  if entity.is_a?(Treat::Entities::Phrase)
@@ -29,15 +31,17 @@ class Treat::Lexicalizers::Categorizers::FromTag
29
31
 
30
32
  if entity.has?(:tag_set)
31
33
  ts = entity.get(:tag_set)
32
- elsif entity.parent_phrase &&
33
- entity.parent_phrase.has?(:tag_set)
34
- ts = entity.parent_phrase.get(:tag_set)
35
34
  else
36
- raise Treat::Exception,
37
- "No information can be found regarding "+
38
- "which tag set to use."
35
+ a = entity.ancestor_with_feature(:phrase, :tag_set)
36
+ if a
37
+ ts = a.get(:tag_set)
38
+ else
39
+ raise Treat::Exception,
40
+ "No information can be found regarding "+
41
+ "which tag set to use."
42
+ end
39
43
  end
40
-
44
+
41
45
  if cat[ts]
42
46
  return cat[ts]
43
47
  else
@@ -35,7 +35,7 @@ module Treat::Lexicalizers::Taggers::Brill
35
35
  # Tokenize the sentence/phrase.
36
36
  if !entity.has_children? &&
37
37
  !entity.is_a?(Treat::Entities::Token)
38
- entity.tokenize(:perl, options)
38
+ entity.tokenize(options)
39
39
  end
40
40
 
41
41
  # Create the tagger if necessary
@@ -38,11 +38,14 @@ class Treat::Lexicalizers::Taggers::Stanford
38
38
  end
39
39
 
40
40
  # Handle tags for sentences and phrases.
41
-
42
41
  if entity.is_a?(Treat::Entities::Sentence) ||
43
42
  (entity.is_a?(Treat::Entities::Phrase) &&
44
43
  !entity.parent_sentence)
45
- entity.set :tag_set, :penn
44
+
45
+ tag_set = Treat::Universalisation::Tags::
46
+ StanfordTagSetForLanguage[
47
+ Treat::Languages.describe(lang)]
48
+ entity.set :tag_set, tag_set
46
49
  end
47
50
 
48
51
  if entity.is_a?(Treat::Entities::Sentence)
@@ -16,7 +16,8 @@ module Treat::Lexicalizers
16
16
  module Categorizers
17
17
  extend Treat::Groupable
18
18
  self.type = :annotator
19
- self.targets = [:token]
19
+ self.targets = [:sentence, :phrase, :token]
20
+ self.recursive = true
20
21
  self.default = :from_tag
21
22
  end
22
23
 
@@ -23,10 +23,10 @@ module Treat::Processors::Parsers::Enju
23
23
  @@parser = nil
24
24
 
25
25
  # A hash of Enju cat tags mapped to word categories.
26
- Ectc = Treat::Linguistics::Tags::EnjuCatToCategory
26
+ Ectc = Treat::Universalisation::Tags::EnjuCatToCategory
27
27
 
28
28
  # A hash of Enju cat/xcat pairs mapped to PTB tags.
29
- Ecxtp = Treat::Linguistics::Tags::EnjuCatXcatToPTB
29
+ Ecxtp = Treat::Universalisation::Tags::EnjuCatXcatToPTB
30
30
 
31
31
  # Parse the entity into its syntactical
32
32
  # phrases using Enju.
@@ -27,6 +27,10 @@ class Treat::Processors::Parsers::Stanford
27
27
  lang = entity.language
28
28
  init(lang, options)
29
29
 
30
+ tag_set = Treat::Universalisation::Tags::
31
+ StanfordTagSetForLanguage[
32
+ Treat::Languages.describe(lang)]
33
+
30
34
  text = ::StanfordCoreNLP::Text.new(val)
31
35
  @@parsers[lang].annotate(text)
32
36
 
@@ -37,17 +41,18 @@ class Treat::Processors::Parsers::Stanford
37
41
  tag = s.get(:category).to_s
38
42
  tag_s, tag_opt = *tag.split('-')
39
43
  tag_s ||= 'S'
40
- entity.set :tag_set, :penn
41
44
  entity.set :tag, tag_s
42
45
  entity.set :tag_opt, tag_opt if tag_opt
43
- recurse(s.get(:tree).children[0], entity)
44
- break
46
+ recurse(s.get(:tree).children[0], entity, tag_set)
47
+ break #######
45
48
  else
46
49
  recurse(s.get(:tree), entity)
47
50
  end
48
51
 
49
52
  end
50
53
 
54
+ entity.set :tag_set, tag_set
55
+
51
56
  end
52
57
 
53
58
  def self.init(lang, options)
@@ -76,7 +81,7 @@ class Treat::Processors::Parsers::Stanford
76
81
 
77
82
  # Helper method which recurses the tree supplied by
78
83
  # the Stanford parser.
79
- def self.recurse(java_node, ruby_node, additional_tags = [])
84
+ def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
80
85
 
81
86
  if java_node.num_children == 0
82
87
 
@@ -85,10 +90,8 @@ class Treat::Processors::Parsers::Stanford
85
90
  tag_s, tag_opt = *tag.split('-')
86
91
  tag_s ||= ''
87
92
  ruby_node.value = java_node.value.to_s.strip
88
- ruby_node.set :tag_set, :penn
89
93
  ruby_node.set :tag, tag_s
90
94
  ruby_node.set :tag_opt, tag_opt if tag_opt
91
- ruby_node.set :tag_set, :penn
92
95
  ruby_node.set :lemma, label.get(:lemma).to_s
93
96
 
94
97
  additional_tags.each do |t|
@@ -103,33 +106,35 @@ class Treat::Processors::Parsers::Stanford
103
106
  if java_node.num_children == 1 &&
104
107
  java_node.children[0].num_children == 0
105
108
  recurse(java_node.children[0],
106
- ruby_node, additional_tags)
109
+ ruby_node, tag_set, additional_tags)
107
110
  return
108
111
  end
109
112
 
110
113
  java_node.children.each do |java_child|
114
+
111
115
  label = java_child.label
112
116
  tag = label.get(:category).to_s
113
117
  tag_s, tag_opt = *tag.split('-')
114
118
  tag_s ||= ''
115
-
116
- if Treat::Linguistics::Tags::PhraseTagToCategory[tag_s]
119
+
120
+ if Treat::Universalisation::Tags::PhraseTagToCategory[tag_s] &&
121
+ Treat::Universalisation::Tags::PhraseTagToCategory[tag_s][tag_set]
117
122
  ruby_child = Treat::Entities::Phrase.new
118
123
  else
119
124
  l = java_child.children[0].to_s
120
125
  v = java_child.children[0].value.to_s.strip
126
+
121
127
  # Mhmhmhmhmhm
122
128
  val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
123
129
  ruby_child = Treat::Entities::Token.from_string(val)
124
130
  end
125
131
 
126
- ruby_child.set :tag_set, :penn
127
132
  ruby_child.set :tag, tag_s
128
133
  ruby_child.set :tag_opt, tag_opt if tag_opt
129
134
  ruby_node << ruby_child
130
135
 
131
136
  unless java_child.children.empty?
132
- recurse(java_child, ruby_child, additional_tags)
137
+ recurse(java_child, ruby_child, tag_set, additional_tags)
133
138
  end
134
139
 
135
140
  end
@@ -137,4 +142,5 @@ class Treat::Processors::Parsers::Stanford
137
142
  end
138
143
 
139
144
  end
145
+
140
146
  end
@@ -38,15 +38,18 @@ module Treat::Processors::Segmenters::Punkt
38
38
 
39
39
  s = entity.to_s
40
40
 
41
- # Replace all decimal points by ^^
41
+ # Replace the point in all floating-point numbers
42
+ # by ^^; this is a fix since Punkt trips on decimal
43
+ # numbers.
42
44
  Treat::Helpers::DecimalPointEscaper.escape!(s)
43
- s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
45
+ s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
44
46
 
45
47
  result = @@segmenters[lang].
46
48
  sentences_from_text(s,
47
49
  :output => :sentences_text)
48
50
 
49
51
  result.each do |sentence|
52
+ # Unescape the sentence.
50
53
  Treat::Helpers::DecimalPointEscaper.
51
54
  unescape!(sentence)
52
55
  entity << Treat::Entities::Phrase.
@@ -29,17 +29,21 @@ module Treat::Processors::Segmenters::Tactful
29
29
  entity.check_hasnt_children
30
30
 
31
31
  s = entity.to_s
32
+
32
33
  Treat::Helpers::DecimalPointEscaper.escape!(s)
33
34
 
34
- s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
35
+ s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
35
36
 
36
37
  @@segmenter ||= TactfulTokenizer::Model.new
37
38
 
38
39
  sentences = @@segmenter.tokenize_text(s)
40
+
39
41
  sentences.each do |sentence|
40
42
  Treat::Helpers::DecimalPointEscaper.unescape!(sentence)
43
+ puts sentence.to_s if sentence.to_s.include?('staff')
41
44
  entity << Treat::Entities::Phrase.from_string(sentence)
42
45
  end
46
+
43
47
  end
44
48
 
45
49
  end
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  # A native rule-based tokenizer based on the one
2
3
  # developed by Robert MacIntyre in 1995 for the Penn
3
4
  # Treebank project. This tokenizer follows the
@@ -11,8 +12,6 @@
11
12
  # you can redistribute it and/or modify it under the
12
13
  # same terms as Ruby itself.
13
14
  module Treat::Processors::Tokenizers::PTB
14
-
15
- require 'treat/helpers/decimal_point_escaper'
16
15
 
17
16
  # Tokenize the entity using a native rule-based algorithm.
18
17
  def self.tokenize(entity, options = {})
@@ -33,8 +32,17 @@ module Treat::Processors::Tokenizers::PTB
33
32
 
34
33
  # Helper method to split the string into tokens.
35
34
  def self.split(string)
35
+
36
36
  s = " " + string + " "
37
- Treat::Helpers::DecimalPointEscaper.escape!(s)
37
+
38
+ # Translate some common extended ascii
39
+ # characters to quotes
40
+ s.gsub!(/‘/,'`')
41
+ s.gsub!(/’/,"'")
42
+ s.gsub!(/“/,"``")
43
+ s.gsub!(/”/,"''")
44
+
45
+
38
46
  s.gsub!(/\s+/," ")
39
47
  s.gsub!(/(\s+)''/,'\1"')
40
48
  s.gsub!(/(\s+)``/,'\1"')
@@ -14,8 +14,6 @@
14
14
  # Project website: https://github.com/lfcipriani/punkt-segmenter
15
15
  class Treat::Processors::Tokenizers::Punkt
16
16
 
17
- require 'treat/helpers/decimal_point_escaper'
18
-
19
17
  SentEndChars = ['.', '?', '!']
20
18
  ReSentEndChars = /[.?!]/
21
19
  InternalPunctuation = [',', ':', ';']
@@ -35,7 +33,6 @@ class Treat::Processors::Tokenizers::Punkt
35
33
  entity.check_hasnt_children
36
34
 
37
35
  s = entity.to_s
38
- Treat::Helpers::DecimalPointEscaper.escape!(s)
39
36
 
40
37
  s.scan(ReWordTokenizer).each do |token|
41
38
  if SentEndChars.include?(token[-1])
@@ -51,6 +51,7 @@ class Treat::Processors::Tokenizers::Tactful
51
51
  entity.check_hasnt_children
52
52
 
53
53
  s = entity.to_s
54
+
54
55
  Treat::Helpers::DecimalPointEscaper.escape!(s)
55
56
 
56
57
  ReTokenize.each do |rules|
@@ -58,6 +59,8 @@ class Treat::Processors::Tokenizers::Tactful
58
59
  end
59
60
 
60
61
  s.split(' ').each do |token|
62
+
63
+ Treat::Helpers::DecimalPointEscaper.unescape!(token)
61
64
  entity << Treat::Entities::Token.
62
65
  from_string(token)
63
66
  end
@@ -0,0 +1,12 @@
1
+ module Treat::Universalisation
2
+
3
+ Encodings = {
4
+ :arabic => 'UTF-8',
5
+ :chinese => 'GB18030',
6
+ :english => 'UTF-8',
7
+ :french => 'ISO_8859-1',
8
+ :german => 'ISO_8859-1',
9
+ :hebrew => 'UTF-8'
10
+ }
11
+
12
+ end
@@ -1,14 +1,20 @@
1
- module Treat::Linguistics::Tags
1
+ module Treat::Universalisation::Tags
2
2
 
3
3
  ClawsC5 = 0
4
4
  Brown = 1
5
5
  Penn = 2
6
- Negra = 3
6
+ Stuttgart = 3
7
7
  PennChinese = 4
8
- Simple = 5
8
+ Paris7 = 5
9
+
10
+ StanfordTagSetForLanguage = {
11
+ :french => :paris7,
12
+ :english => :penn,
13
+ :german => :stuttgart
14
+ }
9
15
 
10
16
  PTBClauseTagDescription = [
11
- ['S', 'Simple declarative clause'],
17
+ ['S', 'Paris7 declarative clause'],
12
18
  ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
13
19
  ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
14
20
  ['SINV', 'Inverted declarative sentence'],
@@ -26,30 +32,33 @@ module Treat::Linguistics::Tags
26
32
 
27
33
  AlignedPhraseTags =
28
34
  [
29
- 'Adjective phrase', ['', '', 'ADJP'],
30
- 'Adverb phrase', ['', '', 'ADVP'],
31
- 'Conjunction phrase', ['', '', 'CONJP'],
32
- 'Fragment', ['', '', 'FRAG'],
33
- 'Interjection', ['', '', 'INTJ'],
34
- 'List marker', ['', '', 'LST'],
35
- 'Not a phrase', ['', '', 'NAC'],
36
- 'Noun phrase', ['', '', 'NP'],
37
- 'Head of NP', ['', '', 'NX'],
38
- 'Prepositional phrase', ['', '', 'PP'],
39
- 'Parenthetical', ['', '', 'PRN'],
40
- 'Particle', ['', '', 'PRT'],
41
- 'Quantifier phrase', ['', '', 'QP'],
42
- 'Reduced relative clause', ['', '', 'RRC'],
43
- 'Unlike coordinated phrase', ['', '', 'UCP'],
44
- 'Verb phrase', ['', '', 'VP'],
45
- 'Wh adjective phrase', ['', '', 'WHADJP'],
46
- 'Wh adverb phrase', ['', '', 'WHAVP'],
47
- 'Wh noun phrase', ['', '', 'WHNP'],
48
- 'Wh prepositional phrase', ['', '', 'WHPP'],
49
- 'Unknown', ['', '', 'X'],
50
- 'Phrase', ['', '', 'P'],
51
- 'Sentence', ['', '', 'S'],
52
- 'Phrase', ['', '', 'SBAR'] # Fix
35
+ 'Adjectival phrase', ['', '', 'ADJP', '', '', 'AP'],
36
+ 'Adverbial phrase', ['', '', 'ADVP', '', '', 'AdP'],
37
+ 'Conjunction phrase', ['', '', 'CONJP', '', '', 'Ssub'],
38
+ 'Fragment', ['', '', 'FRAG', '', '', ''],
39
+ 'Interjectional phrase', ['', '', 'INTJ', '', '', ''],
40
+ 'List marker', ['', '', 'LST', '', '', ''],
41
+ 'Not a phrase', ['', '', 'NAC', '', '', ''],
42
+ 'Noun phrase', ['', '', 'NP', '', '', 'NP'],
43
+ 'Verbal nucleus', ['', '', '', '', '', 'VN'],
44
+ 'Head of noun phrase', ['', '', 'NX', '', '', ''],
45
+ 'Prepositional phrase', ['', '', 'PP', '', '', 'PP'],
46
+ 'Parenthetical', ['', '', 'PRN', '', '', ''],
47
+ 'Particle', ['', '', 'PRT', '', '', ''],
48
+ 'Participial phrase', ['', '', '', '', '', 'VPart'],
49
+ 'Quantifier phrase', ['', '', 'QP', '', '', ''],
50
+ 'Relative clause', ['', '', 'RRC', '', '', 'Srel'],
51
+ 'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
52
+ 'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
53
+ 'Verb phrase', ['', '', 'VP', '', '', ''],
54
+ 'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
55
+ 'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
56
+ 'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
57
+ 'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
58
+ 'Unknown', ['', '', 'X', '', '', ''],
59
+ 'Phrase', ['', '', 'P', '', '', 'Sint'],
60
+ 'Sentence', ['', '', 'S', '', '', 'SENT'],
61
+ 'Phrase', ['', '', 'SBAR', '', '', ''] # Fix
53
62
  ]
54
63
 
55
64
  # A description of Enju categories.
@@ -139,12 +148,12 @@ module Treat::Linguistics::Tags
139
148
  # JRS?
140
149
 
141
150
 
142
- SimpleWordTagToCategory = {
151
+ Paris7WordTagToCategory = {
143
152
  'C' => :complementizer,
144
153
  'PN' => :punctuation,
145
154
  'SC' => :conjunction
146
155
  }
147
-
156
+
148
157
  PunctuationToCategory = {
149
158
  '.' => :period,
150
159
  ',' => :comma,
@@ -152,9 +161,8 @@ module Treat::Linguistics::Tags
152
161
  ':' => :colon,
153
162
  '!' => :exclamation,
154
163
  '?' => :interrogation,
155
- '"' => :quote,
156
- "'" => :quote,
157
-
164
+ '"' => :double_quote,
165
+ "'" => :single_quote,
158
166
  '$' => :dollar,
159
167
  '%' => :percent,
160
168
  '#' => :hash,
@@ -227,7 +235,7 @@ module Treat::Linguistics::Tags
227
235
  'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
228
236
  'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
229
237
  'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
230
-
238
+ 'Interjection', ['', '', '', '', '', 'I'],
231
239
  'Localizer', ['', '', '', '', 'LC'],
232
240
 
233
241
  'Measure word', ['', '', '', '', 'M'],
@@ -366,11 +374,25 @@ module Treat::Linguistics::Tags
366
374
  'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
367
375
  'Verb, ????', ['', '', '', '', 'VC'] # ?
368
376
  ]
369
-
370
- wttc = {
371
-
372
- }
373
- Treat::Linguistics::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
377
+
378
+ # Paris7 Treebank functional tags
379
+ =begin
380
+ SUJ (subject)
381
+ OBJ (direct object)
382
+ ATS (predicative complement of a subject)
383
+ ATO (predicative complement of a direct object)
384
+ MOD (modifier or adjunct)
385
+ A-OBJ (indirect complement introduced by à)
386
+ DE-OBJ (indirect complement introduced by de)
387
+ P-OBJ (indirect complement introduced by another preposition)
388
+ =end
389
+
390
+ # !! Extremely ugly code follows.
391
+
392
+ # Generate word tag -> category hash.
393
+ wttc = {}
394
+
395
+ Treat::Universalisation::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
374
396
 
375
397
  category = desc.gsub(',', ' ,').
376
398
  split(' ')[0].downcase.intern
@@ -378,32 +400,41 @@ module Treat::Linguistics::Tags
378
400
  wttc[tags[ClawsC5]] ||= {}
379
401
  wttc[tags[Brown]] ||= {}
380
402
  wttc[tags[Penn]] ||= {}
381
- wttc[tags[Negra]] ||= {}
403
+ wttc[tags[Stuttgart]] ||= {}
382
404
  wttc[tags[PennChinese]] ||= {}
383
- wttc[tags[Simple]] ||= {}
405
+ wttc[tags[Paris7]] ||= {}
384
406
 
385
407
  wttc[tags[ClawsC5]][:claws_5] = category
386
408
  wttc[tags[Brown]][:brown] = category
387
409
  wttc[tags[Penn]][:penn] = category
388
- wttc[tags[Negra]][:negra] = category if tags[Negra]
410
+ wttc[tags[Stuttgart]][:stuttgart] = category if tags[Stuttgart]
389
411
  wttc[tags[PennChinese]][:penn_chinese] = category if tags[PennChinese]
390
- wttc[tags[Simple]][:simple] = category if tags[Simple]
412
+ wttc[tags[Paris7]][:paris7] = category if tags[Paris7]
391
413
 
392
414
  end
415
+
393
416
  # A hash converting word tags to word categories.
394
417
  WordTagToCategory = wttc
395
418
 
396
419
  # A hash converting phrase tags to phrase categories.
397
420
  pttc = {}
398
- Treat::Linguistics::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
421
+
422
+ Treat::Universalisation::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
423
+
399
424
  category = desc.gsub(',', ' ,').gsub(' ', '_').downcase.intern
425
+
400
426
  pttc[tags[Penn]] ||= {};
427
+ pttc[tags[Paris7]] ||= {};
428
+
429
+ pttc[tags[Penn]][:penn] = category
430
+ pttc[tags[Paris7]][:paris7] = category
431
+
401
432
  # Not yet for other tag sets.
402
433
  #pttc[tags[0]][:claws_5] = category
403
434
  #pttc[tags[1]][:brown] = category
404
- pttc[tags[Penn]][:penn] = category
435
+
405
436
  end
406
-
437
+
407
438
  # A hash converting phrase tags to phrase categories.
408
439
  PhraseTagToCategory = pttc
409
440
 
@@ -0,0 +1,9 @@
1
+ module Treat::Universalisation
2
+
3
+ p = 'treat/universalisation/*.rb'
4
+
5
+ Dir[Treat.lib + p].each do |f|
6
+ require f
7
+ end
8
+
9
+ end
data/lib/treat.rb CHANGED
@@ -10,7 +10,7 @@ module Treat
10
10
  end
11
11
 
12
12
  # The current version of Treat.
13
- VERSION = "1.0.4"
13
+ VERSION = "1.0.5"
14
14
 
15
15
  # Add methods to handle syntactic sugar,
16
16
  # language configuration options, and paths.
@@ -44,7 +44,7 @@ module Treat
44
44
  require 'treat/kernel'
45
45
  require 'treat/downloader'
46
46
  require 'treat/languages'
47
- require 'treat/linguistics'
47
+ require 'treat/universalisation'
48
48
  require 'treat/entities'
49
49
  require 'treat/categories'
50
50
  require 'treat/data_set'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: treat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-20 00:00:00.000000000 Z
12
+ date: 2012-05-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rubyzip
@@ -161,6 +161,7 @@ files:
161
161
  - lib/treat/languages/german.rb
162
162
  - lib/treat/languages/greek.rb
163
163
  - lib/treat/languages/italian.rb
164
+ - lib/treat/languages/language.rb
164
165
  - lib/treat/languages/list.txt
165
166
  - lib/treat/languages/polish.rb
166
167
  - lib/treat/languages/portuguese.rb
@@ -176,9 +177,6 @@ files:
176
177
  - lib/treat/lexicalizers/taggers/lingua.rb
177
178
  - lib/treat/lexicalizers/taggers/stanford.rb
178
179
  - lib/treat/lexicalizers.rb
179
- - lib/treat/linguistics/categories.rb
180
- - lib/treat/linguistics/tags.rb
181
- - lib/treat/linguistics.rb
182
180
  - lib/treat/loaders/linguistics.rb
183
181
  - lib/treat/loaders/stanford.rb
184
182
  - lib/treat/object.rb
@@ -190,7 +188,6 @@ files:
190
188
  - lib/treat/processors/segmenters/punkt.rb
191
189
  - lib/treat/processors/segmenters/stanford.rb
192
190
  - lib/treat/processors/segmenters/tactful.rb
193
- - lib/treat/processors/tokenizers/perl.rb
194
191
  - lib/treat/processors/tokenizers/ptb.rb
195
192
  - lib/treat/processors/tokenizers/punkt.rb
196
193
  - lib/treat/processors/tokenizers/stanford.rb
@@ -202,6 +199,9 @@ files:
202
199
  - lib/treat/retrievers.rb
203
200
  - lib/treat/server.rb
204
201
  - lib/treat/tree.rb
202
+ - lib/treat/universalisation/encodings.rb
203
+ - lib/treat/universalisation/tags.rb
204
+ - lib/treat/universalisation.rb
205
205
  - lib/treat.rb
206
206
  - spec/collection.rb
207
207
  - spec/document.rb
@@ -1,9 +0,0 @@
1
- module Treat::Linguistics
2
-
3
- p = 'treat/linguistics/*.rb'
4
-
5
- Dir[Treat.lib + p].each do |f|
6
- require f
7
- end
8
-
9
- end
@@ -1,132 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # Tokenize the entity using a native rule-based
4
- # algorithm. This tokenizer is a port from an
5
- # unknown Perl module, which I have lifted from
6
- # the 'rbtagger' gem.
7
- #
8
- # Author: Todd A. Fisher
9
- #
10
- # This code is free to use under the terms of
11
- # the MIT license.
12
- #
13
- # Original project website:
14
- #
15
- # https://github.com/taf2/rb-brill-tagger
16
- module Treat::Processors::Tokenizers::Perl
17
-
18
- require 'treat/helpers/decimal_point_escaper'
19
-
20
- # Tokenize the entity using a rule-based algorithm
21
- # ported from Perl by Todd A. Fisher.
22
- #
23
- # Options: none.
24
- def self.tokenize(entity, options = {})
25
-
26
- entity.check_hasnt_children
27
- s = entity.to_s
28
-
29
- tokens = get_tokens(entity.to_s)
30
- tokens[1..-1].each do |token|
31
- next if token =~ /^\s*$/
32
- entity << Treat::Entities::Token.
33
- from_string(token)
34
- end
35
-
36
- end
37
-
38
- # Helper method to perform the tokenization.
39
- def self.get_tokens(string)
40
-
41
- # Normalize all whitespace
42
- text = string.gsub(/\s+/,' ')
43
-
44
- # Replace all decimal points by ^^
45
- Treat::Helpers::DecimalPointEscaper.escape!(text)
46
-
47
- =begin
48
-
49
- # Translate some common extended ascii
50
- # characters to quotes
51
- text.gsub!(/‘/,'`')
52
- text.gsub!(/’/,"'")
53
- text.gsub!(/“/,"``")
54
- text.gsub!(/”/,"''")
55
-
56
- # Attempt to get correct directional quotes
57
- # s{\"\b} { `` }g;
58
- text.gsub!(/\"\b/,' `` ')
59
- # s{\b\"} { '' }g;
60
- text.gsub!(/\b\"/," '' ")
61
- #s{\"(?=\s)} { '' }g;
62
- text.gsub!(/\"(?=\s)/," '' ")
63
- #s{\"} { `` }g;
64
- text.gsub!(/\"(?=\s)/," `` ")
65
- =end
66
-
67
- # Isolate ellipses
68
- # s{\.\.\.} { ... }g;
69
- text.gsub!(/\.\.\./,' ... ')
70
- # Isolate any embedded punctuation chars
71
- # s{([,;:\@\#\$\%&])} { $1 }g;
72
- text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
73
-
74
- # Assume sentence tokenization has been
75
- # done first, so split FINAL
76
- # periods only.
77
- # s/ ([^.]) \. ([\]\)\}\>\"\']*)
78
- # [ \t]* $ /$1 .$2 /gx;
79
- text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
80
- # however, we may as well split ALL
81
- # question marks and exclamation points,
82
- # since they shouldn't have the abbrev.
83
- # -marker ambiguity problem
84
- #s{([?!])} { $1 }g;
85
- text.gsub!(/([?!])/, ' \1 ')
86
- # parentheses, brackets, etc.
87
- #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
88
- text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
89
- #s/(-{2,})/ $1 /g;
90
- text.gsub!(/(-{2,})/,' \1 ')
91
-
92
- # Add a space to the beginning and end of
93
- # each line, to reduce # of regexps below.
94
- #s/$/ /;
95
- text.gsub!(/$/," ")
96
- #s/^/ /;
97
- text.gsub!(/^/," ")
98
-
99
- # possessive or close-single-quote
100
- #s/\([^\']\)\' /$1 \' /g;
101
- text.gsub!(/\([^\']\)\' /,%q(\1 ' ))
102
- # as in it's, I'm, we'd
103
- #s/\'([smd]) / \'$1 /ig;
104
- text.gsub!(/\'([smd]) /i,%q( '\1 ))
105
- #s/\'(ll|re|ve) / \'$1 /ig;
106
- text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
107
- #s/n\'t / n\'t /ig;
108
- text.gsub!(/n\'t /i," n't ")
109
-
110
- #s/ (can)(not) / $1 $2 /ig;
111
- text.gsub!(/ (can)(not) /i,' \1 \2 ')
112
- #s/ (d\')(ye) / $1 $2 /ig;
113
- text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
114
- #s/ (gim)(me) / $1 $2 /ig;
115
- text.gsub!(/ (gim)(me) /i,' \1 \2 ')
116
- #s/ (gon)(na) / $1 $2 /ig;
117
- text.gsub!(/ (gon)(na) /i,' \1 \2 ')
118
- #s/ (got)(ta) / $1 $2 /ig;
119
- text.gsub!(/ (got)(ta) /i,' \1 \2 ')
120
- #s/ (lem)(me) / $1 $2 /ig;
121
- text.gsub!(/ (lem)(me) /i,' \1 \2 ')
122
- #s/ (more)(\'n) / $1 $2 /ig;
123
- text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
124
- #s/ (\'t)(is|was) / $1 $2 /ig;
125
- text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
126
- #s/ (wan)(na) / $1 $2 /ig;
127
- text.gsub!(/ (wan)(na) /i,' \1 \2 ')
128
- text.split(/\s/)
129
-
130
- end
131
-
132
- end