treat 1.0.2 → 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -60,4 +60,4 @@ class Treat::Classification
60
60
  line
61
61
  end
62
62
 
63
- end
63
+ end
@@ -1,12 +1,12 @@
1
1
  class Treat::Dependencies
2
2
 
3
3
  Gem = [
4
- ['psych', '1.2.2', '(un)serialize annotated entities to YAML format'],
5
- ['nokogiri', '>= 1.4.0', 'read and parse XML and HTML formats'],
4
+ ['psych', '>= 1.2.2', '(un)serialize annotated entities to YAML format'],
5
+ ['nokogiri', '>= 1.5.2', 'read and parse XML and HTML formats'],
6
6
  ['sdsykes-ferret', '>= 0.11.6.19', 'perform full-text search in collections'],
7
7
  ['lda-ruby', '>= 0.3.8', 'extract topic words from documents and collections'],
8
- ['ruby-readability', '>= 0.5.0', 'extract the readable content from HTML pages'],
9
- ['stanford-core-nlp', '>= 0.1.8', 'tokenize, segment, parse texts and perform named entity recognition'],
8
+ ['ruby-readability', '>= 0.5.2', 'extract the readable content from HTML pages'],
9
+ ['stanford-core-nlp', '>= 0.3.0', 'tokenize, segment, parse texts and perform named entity recognition'],
10
10
  ['whatlanguage', '>= 1.0.0', 'detect the language of text'],
11
11
  ['linguistics', '>= 1.0.9', 'retrieve the inflection of nouns, verbs and numbers in English'],
12
12
  ['punkt-segmenter', '>= 0.9.1', 'segment texts into sentences'],
@@ -13,7 +13,8 @@ module Treat::Entities::Abilities::Buildable
13
13
  PunctRegexp = /^[[:punct:]\$]+$/
14
14
  UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
15
15
  EmailRegexp = /.+\@.+\..+/
16
-
16
+ Enclitics = %w['ll 'm 're 's 't 've]
17
+
17
18
  # Reserved folder names
18
19
  Reserved = ['.index']
19
20
 
@@ -264,8 +265,8 @@ module Treat::Entities::Abilities::Buildable
264
265
  def token_from_string(string)
265
266
 
266
267
  check_encoding(string)
267
- if string == "'s" || string == "'S"
268
- Treat::Entities::Clitic.new(string)
268
+ if Enclitics.include?(string.downcase)
269
+ Treat::Entities::Enclitic.new(string)
269
270
  elsif string =~ WordRegexp &&
270
271
  string.count(' ') == 0 &&
271
272
  string != '-'
@@ -30,7 +30,7 @@ module Treat::Entities::Abilities::Iterable
30
30
  a = []
31
31
  type = :entity unless type
32
32
  each_entity(type) do |e|
33
- r = e.send(feature)
33
+ r = e.get(feature)
34
34
  a << e if r == value
35
35
  end
36
36
  a
@@ -69,7 +69,7 @@ module Treat::Entities::Abilities::Stringable
69
69
 
70
70
  if child.is_a?(Treat::Entities::Token) || child.value != ''
71
71
  if child.is_a?(Treat::Entities::Punctuation) ||
72
- child.is_a?(Treat::Entities::Clitic)
72
+ child.is_a?(Treat::Entities::Enclitic)
73
73
  value.strip!
74
74
  end
75
75
  value += child.to_s + ' '
@@ -76,7 +76,7 @@ module Treat::Entities
76
76
  end
77
77
 
78
78
  # Represents a clitic ('s).
79
- class Clitic < Token; end
79
+ class Enclitic < Token; end
80
80
 
81
81
  # Represents a number.
82
82
  class Number < Token
@@ -4,7 +4,8 @@
4
4
  class Treat::Extractors::NameTag::Stanford
5
5
 
6
6
  require 'treat/loaders/stanford'
7
-
7
+ Treat::Loaders::Stanford.load
8
+
8
9
  @@classifiers = {}
9
10
 
10
11
  def self.name_tag(entity, options = {})
@@ -12,9 +13,10 @@ class Treat::Extractors::NameTag::Stanford
12
13
  pp = nil
13
14
 
14
15
  lang = entity.language
15
-
16
+
16
17
  language = Treat::Languages.describe(lang)
17
-
18
+ Treat::Loaders::Stanford.load(language)
19
+
18
20
  isolated_token = entity.is_a?(Treat::Entities::Token)
19
21
  tokens = isolated_token ? [entity] : entity.tokens
20
22
 
@@ -1,6 +1,7 @@
1
1
  class Treat::Formatters::Readers::XML
2
2
 
3
3
  require 'treat/loaders/stanford'
4
+ Treat::Loaders::Stanford.load
4
5
  require 'cgi'
5
6
 
6
7
  # By default, don't backup the XML
@@ -32,7 +32,7 @@ class Treat::Formatters::Serializers::Mongo
32
32
  coll = @@db[type_id[0]][type_id[1]]
33
33
  end
34
34
 
35
- # Store path
35
+ # Store path
36
36
 
37
37
  Treat::Entities.list.each do |type|
38
38
 
@@ -22,6 +22,7 @@ class Treat::Lexicalizers::Taggers::Stanford
22
22
 
23
23
  # Handle options and initialize the tagger.
24
24
  lang = entity.language
25
+
25
26
  options = get_options(options, lang)
26
27
  tokens, list = get_token_list(entity)
27
28
  init_tagger(lang)
@@ -55,14 +56,13 @@ class Treat::Lexicalizers::Taggers::Stanford
55
56
 
56
57
  # Initialize the tagger for a language.
57
58
  def self.init_tagger(lang)
58
-
59
59
  language = Treat::Languages.describe(lang)
60
+ Treat::Loaders::Stanford.load(language)
60
61
  model = StanfordCoreNLP::Config::Models[:pos][language]
61
- model = Treat.models + 'stanford/' +
62
+ model = Treat::Loaders::Stanford.model_path +
62
63
  StanfordCoreNLP::Config::ModelFolders[:pos] + model
63
64
  @@taggers[lang] ||=
64
65
  StanfordCoreNLP::MaxentTagger.new(model)
65
-
66
66
  end
67
67
 
68
68
  # Handle the options for the tagger.
@@ -1,26 +1,30 @@
1
- class Treat::Loaders
1
+ module Treat
2
2
 
3
- # A helper class to load a language class
4
- # registered with the Linguistics gem.
5
- class Linguistics
3
+ module Loaders
6
4
 
7
- silence_warnings { require 'linguistics' }
8
- @@languages = {}
5
+ # A helper class to load a language class
6
+ # registered with the Linguistics gem.
7
+ class Linguistics
9
8
 
10
- def self.load(language)
11
- if @@languages[language]
12
- return @@languages[language]
13
- end
14
- begin
15
- l = language.to_s.upcase
16
- silence_warnings do
17
- @@languages[language] =
18
- ::Linguistics.const_get(l)
9
+ silence_warnings { require 'linguistics' }
10
+ @@languages = {}
11
+
12
+ def self.load(language)
13
+ if @@languages[language]
14
+ return @@languages[language]
19
15
  end
20
- rescue RuntimeError
21
- raise "Ruby Linguistics does " +
22
- "not have a module installed " +
23
- "for the #{language} language."
16
+ begin
17
+ l = language.to_s.upcase
18
+ silence_warnings do
19
+ @@languages[language] =
20
+ ::Linguistics.const_get(l)
21
+ end
22
+ rescue RuntimeError
23
+ raise "Ruby Linguistics does " +
24
+ "not have a module installed " +
25
+ "for the #{language} language."
26
+ end
27
+
24
28
  end
25
29
 
26
30
  end
@@ -1,27 +1,45 @@
1
- class Treat::Loaders
1
+ module Treat
2
+
3
+ module Loaders
2
4
 
3
- # A helper class to load a language class
4
- # registered with the Linguistics gem.
5
- class Stanford
5
+ class Stanford
6
6
 
7
- require 'stanford-core-nlp'
8
-
9
- StanfordCoreNLP.jar_path =
10
- Treat.bin + 'stanford/'
11
-
12
- StanfordCoreNLP.model_path =
13
- Treat.models + 'stanford/'
14
-
15
- StanfordCoreNLP.use(
16
- Treat::Languages.describe(
17
- Treat.default_language))
7
+ require 'stanford-core-nlp'
18
8
 
19
- StanfordCoreNLP.log_file =
20
- NULL_DEVICE if Treat.silence
21
-
22
- StanfordCoreNLP.bind
23
- @@loaded = true
9
+ class << self
10
+ attr_accessor :jar_path
11
+ attr_accessor :model_path
12
+ attr_accessor :loaded
13
+ end
14
+
15
+ self.jar_path = Treat.bin + 'stanford/'
16
+ self.model_path = Treat.models + 'stanford/'
17
+ self.loaded = false
18
+
19
+ def self.load(language = nil)
20
+
21
+ return if self.loaded
22
+
23
+ language ||=
24
+ Treat::Languages.describe(
25
+ Treat.default_language)
26
+
27
+ StanfordCoreNLP.jar_path = self.jar_path
28
+ StanfordCoreNLP.model_path = self.model_path
29
+
30
+ StanfordCoreNLP.use(language)
31
+
32
+ StanfordCoreNLP.log_file =
33
+ NULL_DEVICE if Treat.silence
34
+
35
+ StanfordCoreNLP.bind
36
+
37
+ self.loaded = true
38
+
39
+ end
40
+
41
+ end
24
42
 
25
43
  end
26
-
27
- end
44
+
45
+ end
@@ -26,7 +26,7 @@ class Treat::Processors::Parsers::Stanford
26
26
  val = entity.to_s
27
27
  lang = entity.language
28
28
  init(lang, options)
29
-
29
+
30
30
  text = ::StanfordCoreNLP::Text.new(val)
31
31
  @@parsers[lang].annotate(text)
32
32
 
@@ -52,6 +52,10 @@ class Treat::Processors::Parsers::Stanford
52
52
 
53
53
  def self.init(lang, options)
54
54
  return if @@parsers[lang]
55
+
56
+ language = Treat::Languages.describe(lang)
57
+ Treat::Loaders::Stanford.load(language)
58
+
55
59
  options = DefaultOptions.merge(options)
56
60
  StanfordCoreNLP.use(lang)
57
61
  if options[:tagger_model]
@@ -3,7 +3,8 @@
3
3
  class Treat::Processors::Segmenters::Stanford
4
4
 
5
5
  require 'treat/loaders/stanford'
6
-
6
+ Treat::Loaders::Stanford.load
7
+
7
8
  DefaultOptions = {
8
9
  :also_tokenize => false
9
10
  }
@@ -3,7 +3,8 @@
3
3
  class Treat::Processors::Tokenizers::Stanford
4
4
 
5
5
  require 'treat/loaders/stanford'
6
-
6
+ Treat::Loaders::Stanford.load
7
+
7
8
  @@tokenizer = nil
8
9
 
9
10
  # Tokenize the entity using a Penn-Treebank
@@ -0,0 +1,26 @@
1
+ class Treat::Server
2
+
3
+ require 'thin'
4
+
5
+ def self.start
6
+ app = proc do |env|
7
+ #!/usr/bin/env ruby -w
8
+ # simple_service.rb
9
+ # A simple DRb service
10
+
11
+ # load DRb
12
+ require 'drb'
13
+
14
+ # start up the DRb service
15
+ DRb.start_service nil, []
16
+
17
+ # We need the uri of the service to connect a client
18
+ puts DRb.uri
19
+
20
+ # wait for the DRb service to finish before exiting
21
+ DRb.thread.join
22
+ end
23
+ run app
24
+ end
25
+
26
+ end
data/lib/treat/tree.rb CHANGED
@@ -175,6 +175,8 @@ module Treat::Tree
175
175
 
176
176
  # Return a feature.
177
177
  def get(feature)
178
+ return @value if feature == :value
179
+ return @id if feature == :id
178
180
  @features[feature]
179
181
  end
180
182
 
data/lib/treat.rb CHANGED
@@ -10,7 +10,7 @@ module Treat
10
10
  end
11
11
 
12
12
  # The current version of Treat.
13
- VERSION = "1.0.2"
13
+ VERSION = "1.0.3"
14
14
 
15
15
  # Add methods to handle syntactic sugar,
16
16
  # language configuration options, and paths.
@@ -49,7 +49,7 @@ module Treat
49
49
  require 'treat/categories'
50
50
  require 'treat/data_set'
51
51
  require 'treat/proxies'
52
-
52
+
53
53
  # Install packages for a given language.
54
54
  def self.install(language = :english)
55
55
  require 'treat/installer'
data/spec/sandbox.rb CHANGED
@@ -1,24 +0,0 @@
1
- require_relative '../lib/treat'
2
-
3
- s = Sentence "Barack Obama was killed last night."
4
- s.tokenize
5
-
6
- puts s.word_with_position(2).inspect
7
-
8
- s.word_with_position(2).set :highlighted, 1
9
-
10
- cl = Treat::Classification.new(
11
- :word,
12
- [[:position, 0]],
13
- :highlighted,
14
- 0
15
- )
16
-
17
- data_set = s.export(cl)
18
-
19
- s2 = Sentence ''
20
- w = Word 'Hello'
21
- s2 << w
22
- w.set :position, 2
23
-
24
- puts w.classify(:mlp, :training => data_set).inspect
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: treat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-14 00:00:00.000000000 Z
12
+ date: 2012-04-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rubyzip
@@ -200,6 +200,7 @@ files:
200
200
  - lib/treat/retrievers/indexers/ferret.rb
201
201
  - lib/treat/retrievers/searchers/ferret.rb
202
202
  - lib/treat/retrievers.rb
203
+ - lib/treat/server.rb
203
204
  - lib/treat/tree.rb
204
205
  - lib/treat.rb
205
206
  - spec/collection.rb
@@ -224,7 +225,9 @@ files:
224
225
  - LICENSE
225
226
  homepage: https://github.com/louismullie/treat
226
227
  licenses: []
227
- post_install_message:
228
+ post_install_message: ! "********************************************************************************\n\nThank
229
+ you for installing Treat!\n\nComplete the installation by running:\n\n require
230
+ 'treat'\n Treat.install\n\ninside IRB or a Ruby script.\n\n********************************************************************************\n\n"
228
231
  rdoc_options: []
229
232
  require_paths:
230
233
  - lib