treat 1.0.2 → 1.0.3

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -60,4 +60,4 @@ class Treat::Classification
60
60
  line
61
61
  end
62
62
 
63
- end
63
+ end
@@ -1,12 +1,12 @@
1
1
  class Treat::Dependencies
2
2
 
3
3
  Gem = [
4
- ['psych', '1.2.2', '(un)serialize annotated entities to YAML format'],
5
- ['nokogiri', '>= 1.4.0', 'read and parse XML and HTML formats'],
4
+ ['psych', '>= 1.2.2', '(un)serialize annotated entities to YAML format'],
5
+ ['nokogiri', '>= 1.5.2', 'read and parse XML and HTML formats'],
6
6
  ['sdsykes-ferret', '>= 0.11.6.19', 'perform full-text search in collections'],
7
7
  ['lda-ruby', '>= 0.3.8', 'extract topic words from documents and collections'],
8
- ['ruby-readability', '>= 0.5.0', 'extract the readable content from HTML pages'],
9
- ['stanford-core-nlp', '>= 0.1.8', 'tokenize, segment, parse texts and perform named entity recognition'],
8
+ ['ruby-readability', '>= 0.5.2', 'extract the readable content from HTML pages'],
9
+ ['stanford-core-nlp', '>= 0.3.0', 'tokenize, segment, parse texts and perform named entity recognition'],
10
10
  ['whatlanguage', '>= 1.0.0', 'detect the language of text'],
11
11
  ['linguistics', '>= 1.0.9', 'retrieve the inflection of nouns, verbs and numbers in English'],
12
12
  ['punkt-segmenter', '>= 0.9.1', 'segment texts into sentences'],
@@ -13,7 +13,8 @@ module Treat::Entities::Abilities::Buildable
13
13
  PunctRegexp = /^[[:punct:]\$]+$/
14
14
  UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
15
15
  EmailRegexp = /.+\@.+\..+/
16
-
16
+ Enclitics = %w['ll 'm 're 's 't 've]
17
+
17
18
  # Reserved folder names
18
19
  Reserved = ['.index']
19
20
 
@@ -264,8 +265,8 @@ module Treat::Entities::Abilities::Buildable
264
265
  def token_from_string(string)
265
266
 
266
267
  check_encoding(string)
267
- if string == "'s" || string == "'S"
268
- Treat::Entities::Clitic.new(string)
268
+ if Enclitics.include?(string.downcase)
269
+ Treat::Entities::Enclitic.new(string)
269
270
  elsif string =~ WordRegexp &&
270
271
  string.count(' ') == 0 &&
271
272
  string != '-'
@@ -30,7 +30,7 @@ module Treat::Entities::Abilities::Iterable
30
30
  a = []
31
31
  type = :entity unless type
32
32
  each_entity(type) do |e|
33
- r = e.send(feature)
33
+ r = e.get(feature)
34
34
  a << e if r == value
35
35
  end
36
36
  a
@@ -69,7 +69,7 @@ module Treat::Entities::Abilities::Stringable
69
69
 
70
70
  if child.is_a?(Treat::Entities::Token) || child.value != ''
71
71
  if child.is_a?(Treat::Entities::Punctuation) ||
72
- child.is_a?(Treat::Entities::Clitic)
72
+ child.is_a?(Treat::Entities::Enclitic)
73
73
  value.strip!
74
74
  end
75
75
  value += child.to_s + ' '
@@ -76,7 +76,7 @@ module Treat::Entities
76
76
  end
77
77
 
78
78
  # Represents a clitic ('s).
79
- class Clitic < Token; end
79
+ class Enclitic < Token; end
80
80
 
81
81
  # Represents a number.
82
82
  class Number < Token
@@ -4,7 +4,8 @@
4
4
  class Treat::Extractors::NameTag::Stanford
5
5
 
6
6
  require 'treat/loaders/stanford'
7
-
7
+ Treat::Loaders::Stanford.load
8
+
8
9
  @@classifiers = {}
9
10
 
10
11
  def self.name_tag(entity, options = {})
@@ -12,9 +13,10 @@ class Treat::Extractors::NameTag::Stanford
12
13
  pp = nil
13
14
 
14
15
  lang = entity.language
15
-
16
+
16
17
  language = Treat::Languages.describe(lang)
17
-
18
+ Treat::Loaders::Stanford.load(language)
19
+
18
20
  isolated_token = entity.is_a?(Treat::Entities::Token)
19
21
  tokens = isolated_token ? [entity] : entity.tokens
20
22
 
@@ -1,6 +1,7 @@
1
1
  class Treat::Formatters::Readers::XML
2
2
 
3
3
  require 'treat/loaders/stanford'
4
+ Treat::Loaders::Stanford.load
4
5
  require 'cgi'
5
6
 
6
7
  # By default, don't backup the XML
@@ -32,7 +32,7 @@ class Treat::Formatters::Serializers::Mongo
32
32
  coll = @@db[type_id[0]][type_id[1]]
33
33
  end
34
34
 
35
- # Store path
35
+ # Store path
36
36
 
37
37
  Treat::Entities.list.each do |type|
38
38
 
@@ -22,6 +22,7 @@ class Treat::Lexicalizers::Taggers::Stanford
22
22
 
23
23
  # Handle options and initialize the tagger.
24
24
  lang = entity.language
25
+
25
26
  options = get_options(options, lang)
26
27
  tokens, list = get_token_list(entity)
27
28
  init_tagger(lang)
@@ -55,14 +56,13 @@ class Treat::Lexicalizers::Taggers::Stanford
55
56
 
56
57
  # Initialize the tagger for a language.
57
58
  def self.init_tagger(lang)
58
-
59
59
  language = Treat::Languages.describe(lang)
60
+ Treat::Loaders::Stanford.load(language)
60
61
  model = StanfordCoreNLP::Config::Models[:pos][language]
61
- model = Treat.models + 'stanford/' +
62
+ model = Treat::Loaders::Stanford.model_path +
62
63
  StanfordCoreNLP::Config::ModelFolders[:pos] + model
63
64
  @@taggers[lang] ||=
64
65
  StanfordCoreNLP::MaxentTagger.new(model)
65
-
66
66
  end
67
67
 
68
68
  # Handle the options for the tagger.
@@ -1,26 +1,30 @@
1
- class Treat::Loaders
1
+ module Treat
2
2
 
3
- # A helper class to load a language class
4
- # registered with the Linguistics gem.
5
- class Linguistics
3
+ module Loaders
6
4
 
7
- silence_warnings { require 'linguistics' }
8
- @@languages = {}
5
+ # A helper class to load a language class
6
+ # registered with the Linguistics gem.
7
+ class Linguistics
9
8
 
10
- def self.load(language)
11
- if @@languages[language]
12
- return @@languages[language]
13
- end
14
- begin
15
- l = language.to_s.upcase
16
- silence_warnings do
17
- @@languages[language] =
18
- ::Linguistics.const_get(l)
9
+ silence_warnings { require 'linguistics' }
10
+ @@languages = {}
11
+
12
+ def self.load(language)
13
+ if @@languages[language]
14
+ return @@languages[language]
19
15
  end
20
- rescue RuntimeError
21
- raise "Ruby Linguistics does " +
22
- "not have a module installed " +
23
- "for the #{language} language."
16
+ begin
17
+ l = language.to_s.upcase
18
+ silence_warnings do
19
+ @@languages[language] =
20
+ ::Linguistics.const_get(l)
21
+ end
22
+ rescue RuntimeError
23
+ raise "Ruby Linguistics does " +
24
+ "not have a module installed " +
25
+ "for the #{language} language."
26
+ end
27
+
24
28
  end
25
29
 
26
30
  end
@@ -1,27 +1,45 @@
1
- class Treat::Loaders
1
+ module Treat
2
+
3
+ module Loaders
2
4
 
3
- # A helper class to load a language class
4
- # registered with the Linguistics gem.
5
- class Stanford
5
+ class Stanford
6
6
 
7
- require 'stanford-core-nlp'
8
-
9
- StanfordCoreNLP.jar_path =
10
- Treat.bin + 'stanford/'
11
-
12
- StanfordCoreNLP.model_path =
13
- Treat.models + 'stanford/'
14
-
15
- StanfordCoreNLP.use(
16
- Treat::Languages.describe(
17
- Treat.default_language))
7
+ require 'stanford-core-nlp'
18
8
 
19
- StanfordCoreNLP.log_file =
20
- NULL_DEVICE if Treat.silence
21
-
22
- StanfordCoreNLP.bind
23
- @@loaded = true
9
+ class << self
10
+ attr_accessor :jar_path
11
+ attr_accessor :model_path
12
+ attr_accessor :loaded
13
+ end
14
+
15
+ self.jar_path = Treat.bin + 'stanford/'
16
+ self.model_path = Treat.models + 'stanford/'
17
+ self.loaded = false
18
+
19
+ def self.load(language = nil)
20
+
21
+ return if self.loaded
22
+
23
+ language ||=
24
+ Treat::Languages.describe(
25
+ Treat.default_language)
26
+
27
+ StanfordCoreNLP.jar_path = self.jar_path
28
+ StanfordCoreNLP.model_path = self.model_path
29
+
30
+ StanfordCoreNLP.use(language)
31
+
32
+ StanfordCoreNLP.log_file =
33
+ NULL_DEVICE if Treat.silence
34
+
35
+ StanfordCoreNLP.bind
36
+
37
+ self.loaded = true
38
+
39
+ end
40
+
41
+ end
24
42
 
25
43
  end
26
-
27
- end
44
+
45
+ end
@@ -26,7 +26,7 @@ class Treat::Processors::Parsers::Stanford
26
26
  val = entity.to_s
27
27
  lang = entity.language
28
28
  init(lang, options)
29
-
29
+
30
30
  text = ::StanfordCoreNLP::Text.new(val)
31
31
  @@parsers[lang].annotate(text)
32
32
 
@@ -52,6 +52,10 @@ class Treat::Processors::Parsers::Stanford
52
52
 
53
53
  def self.init(lang, options)
54
54
  return if @@parsers[lang]
55
+
56
+ language = Treat::Languages.describe(lang)
57
+ Treat::Loaders::Stanford.load(language)
58
+
55
59
  options = DefaultOptions.merge(options)
56
60
  StanfordCoreNLP.use(lang)
57
61
  if options[:tagger_model]
@@ -3,7 +3,8 @@
3
3
  class Treat::Processors::Segmenters::Stanford
4
4
 
5
5
  require 'treat/loaders/stanford'
6
-
6
+ Treat::Loaders::Stanford.load
7
+
7
8
  DefaultOptions = {
8
9
  :also_tokenize => false
9
10
  }
@@ -3,7 +3,8 @@
3
3
  class Treat::Processors::Tokenizers::Stanford
4
4
 
5
5
  require 'treat/loaders/stanford'
6
-
6
+ Treat::Loaders::Stanford.load
7
+
7
8
  @@tokenizer = nil
8
9
 
9
10
  # Tokenize the entity using a Penn-Treebank
@@ -0,0 +1,26 @@
1
+ class Treat::Server
2
+
3
+ require 'thin'
4
+
5
+ def self.start
6
+ app = proc do |env|
7
+ #!/usr/bin/env ruby -w
8
+ # simple_service.rb
9
+ # A simple DRb service
10
+
11
+ # load DRb
12
+ require 'drb'
13
+
14
+ # start up the DRb service
15
+ DRb.start_service nil, []
16
+
17
+ # We need the uri of the service to connect a client
18
+ puts DRb.uri
19
+
20
+ # wait for the DRb service to finish before exiting
21
+ DRb.thread.join
22
+ end
23
+ run app
24
+ end
25
+
26
+ end
data/lib/treat/tree.rb CHANGED
@@ -175,6 +175,8 @@ module Treat::Tree
175
175
 
176
176
  # Return a feature.
177
177
  def get(feature)
178
+ return @value if feature == :value
179
+ return @id if feature == :id
178
180
  @features[feature]
179
181
  end
180
182
 
data/lib/treat.rb CHANGED
@@ -10,7 +10,7 @@ module Treat
10
10
  end
11
11
 
12
12
  # The current version of Treat.
13
- VERSION = "1.0.2"
13
+ VERSION = "1.0.3"
14
14
 
15
15
  # Add methods to handle syntactic sugar,
16
16
  # language configuration options, and paths.
@@ -49,7 +49,7 @@ module Treat
49
49
  require 'treat/categories'
50
50
  require 'treat/data_set'
51
51
  require 'treat/proxies'
52
-
52
+
53
53
  # Install packages for a given language.
54
54
  def self.install(language = :english)
55
55
  require 'treat/installer'
data/spec/sandbox.rb CHANGED
@@ -1,24 +0,0 @@
1
- require_relative '../lib/treat'
2
-
3
- s = Sentence "Barack Obama was killed last night."
4
- s.tokenize
5
-
6
- puts s.word_with_position(2).inspect
7
-
8
- s.word_with_position(2).set :highlighted, 1
9
-
10
- cl = Treat::Classification.new(
11
- :word,
12
- [[:position, 0]],
13
- :highlighted,
14
- 0
15
- )
16
-
17
- data_set = s.export(cl)
18
-
19
- s2 = Sentence ''
20
- w = Word 'Hello'
21
- s2 << w
22
- w.set :position, 2
23
-
24
- puts w.classify(:mlp, :training => data_set).inspect
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: treat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-14 00:00:00.000000000 Z
12
+ date: 2012-04-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rubyzip
@@ -200,6 +200,7 @@ files:
200
200
  - lib/treat/retrievers/indexers/ferret.rb
201
201
  - lib/treat/retrievers/searchers/ferret.rb
202
202
  - lib/treat/retrievers.rb
203
+ - lib/treat/server.rb
203
204
  - lib/treat/tree.rb
204
205
  - lib/treat.rb
205
206
  - spec/collection.rb
@@ -224,7 +225,9 @@ files:
224
225
  - LICENSE
225
226
  homepage: https://github.com/louismullie/treat
226
227
  licenses: []
227
- post_install_message:
228
+ post_install_message: ! "********************************************************************************\n\nThank
229
+ you for installing Treat!\n\nComplete the installation by running:\n\n require
230
+ 'treat'\n Treat.install\n\ninside IRB or a Ruby script.\n\n********************************************************************************\n\n"
228
231
  rdoc_options: []
229
232
  require_paths:
230
233
  - lib