treat 1.0.2 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/treat/classification.rb +1 -1
- data/lib/treat/dependencies.rb +4 -4
- data/lib/treat/entities/abilities/buildable.rb +4 -3
- data/lib/treat/entities/abilities/iterable.rb +1 -1
- data/lib/treat/entities/abilities/stringable.rb +1 -1
- data/lib/treat/entities/entities.rb +1 -1
- data/lib/treat/extractors/name_tag/stanford.rb +5 -3
- data/lib/treat/formatters/readers/xml.rb +1 -0
- data/lib/treat/formatters/serializers/mongo.rb +1 -1
- data/lib/treat/lexicalizers/taggers/stanford.rb +3 -3
- data/lib/treat/loaders/linguistics.rb +23 -19
- data/lib/treat/loaders/stanford.rb +40 -22
- data/lib/treat/processors/parsers/stanford.rb +5 -1
- data/lib/treat/processors/segmenters/stanford.rb +2 -1
- data/lib/treat/processors/tokenizers/stanford.rb +2 -1
- data/lib/treat/server.rb +26 -0
- data/lib/treat/tree.rb +2 -0
- data/lib/treat.rb +2 -2
- data/spec/sandbox.rb +0 -24
- metadata +6 -3
data/lib/treat/classification.rb
CHANGED
data/lib/treat/dependencies.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
class Treat::Dependencies
|
2
2
|
|
3
3
|
Gem = [
|
4
|
-
['psych', '1.2.2', '(un)serialize annotated entities to YAML format'],
|
5
|
-
['nokogiri', '>= 1.
|
4
|
+
['psych', '>= 1.2.2', '(un)serialize annotated entities to YAML format'],
|
5
|
+
['nokogiri', '>= 1.5.2', 'read and parse XML and HTML formats'],
|
6
6
|
['sdsykes-ferret', '>= 0.11.6.19', 'perform full-text search in collections'],
|
7
7
|
['lda-ruby', '>= 0.3.8', 'extract topic words from documents and collections'],
|
8
|
-
['ruby-readability', '>= 0.5.
|
9
|
-
['stanford-core-nlp', '>= 0.
|
8
|
+
['ruby-readability', '>= 0.5.2', 'extract the readable content from HTML pages'],
|
9
|
+
['stanford-core-nlp', '>= 0.3.0', 'tokenize, segment, parse texts and perform named entity recognition'],
|
10
10
|
['whatlanguage', '>= 1.0.0', 'detect the language of text'],
|
11
11
|
['linguistics', '>= 1.0.9', 'retrieve the inflection of nouns, verbs and numbers in English'],
|
12
12
|
['punkt-segmenter', '>= 0.9.1', 'segment texts into sentences'],
|
@@ -13,7 +13,8 @@ module Treat::Entities::Abilities::Buildable
|
|
13
13
|
PunctRegexp = /^[[:punct:]\$]+$/
|
14
14
|
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
|
15
15
|
EmailRegexp = /.+\@.+\..+/
|
16
|
-
|
16
|
+
Enclitics = %w['ll 'm 're 's 't 've]
|
17
|
+
|
17
18
|
# Reserved folder names
|
18
19
|
Reserved = ['.index']
|
19
20
|
|
@@ -264,8 +265,8 @@ module Treat::Entities::Abilities::Buildable
|
|
264
265
|
def token_from_string(string)
|
265
266
|
|
266
267
|
check_encoding(string)
|
267
|
-
if string
|
268
|
-
Treat::Entities::
|
268
|
+
if Enclitics.include?(string.downcase)
|
269
|
+
Treat::Entities::Enclitic.new(string)
|
269
270
|
elsif string =~ WordRegexp &&
|
270
271
|
string.count(' ') == 0 &&
|
271
272
|
string != '-'
|
@@ -69,7 +69,7 @@ module Treat::Entities::Abilities::Stringable
|
|
69
69
|
|
70
70
|
if child.is_a?(Treat::Entities::Token) || child.value != ''
|
71
71
|
if child.is_a?(Treat::Entities::Punctuation) ||
|
72
|
-
child.is_a?(Treat::Entities::
|
72
|
+
child.is_a?(Treat::Entities::Enclitic)
|
73
73
|
value.strip!
|
74
74
|
end
|
75
75
|
value += child.to_s + ' '
|
@@ -4,7 +4,8 @@
|
|
4
4
|
class Treat::Extractors::NameTag::Stanford
|
5
5
|
|
6
6
|
require 'treat/loaders/stanford'
|
7
|
-
|
7
|
+
Treat::Loaders::Stanford.load
|
8
|
+
|
8
9
|
@@classifiers = {}
|
9
10
|
|
10
11
|
def self.name_tag(entity, options = {})
|
@@ -12,9 +13,10 @@ class Treat::Extractors::NameTag::Stanford
|
|
12
13
|
pp = nil
|
13
14
|
|
14
15
|
lang = entity.language
|
15
|
-
|
16
|
+
|
16
17
|
language = Treat::Languages.describe(lang)
|
17
|
-
|
18
|
+
Treat::Loaders::Stanford.load(language)
|
19
|
+
|
18
20
|
isolated_token = entity.is_a?(Treat::Entities::Token)
|
19
21
|
tokens = isolated_token ? [entity] : entity.tokens
|
20
22
|
|
@@ -22,6 +22,7 @@ class Treat::Lexicalizers::Taggers::Stanford
|
|
22
22
|
|
23
23
|
# Handle options and initialize the tagger.
|
24
24
|
lang = entity.language
|
25
|
+
|
25
26
|
options = get_options(options, lang)
|
26
27
|
tokens, list = get_token_list(entity)
|
27
28
|
init_tagger(lang)
|
@@ -55,14 +56,13 @@ class Treat::Lexicalizers::Taggers::Stanford
|
|
55
56
|
|
56
57
|
# Initialize the tagger for a language.
|
57
58
|
def self.init_tagger(lang)
|
58
|
-
|
59
59
|
language = Treat::Languages.describe(lang)
|
60
|
+
Treat::Loaders::Stanford.load(language)
|
60
61
|
model = StanfordCoreNLP::Config::Models[:pos][language]
|
61
|
-
model = Treat.
|
62
|
+
model = Treat::Loaders::Stanford.model_path +
|
62
63
|
StanfordCoreNLP::Config::ModelFolders[:pos] + model
|
63
64
|
@@taggers[lang] ||=
|
64
65
|
StanfordCoreNLP::MaxentTagger.new(model)
|
65
|
-
|
66
66
|
end
|
67
67
|
|
68
68
|
# Handle the options for the tagger.
|
@@ -1,26 +1,30 @@
|
|
1
|
-
|
1
|
+
module Treat
|
2
2
|
|
3
|
-
|
4
|
-
# registered with the Linguistics gem.
|
5
|
-
class Linguistics
|
3
|
+
module Loaders
|
6
4
|
|
7
|
-
|
8
|
-
|
5
|
+
# A helper class to load a language class
|
6
|
+
# registered with the Linguistics gem.
|
7
|
+
class Linguistics
|
9
8
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
silence_warnings do
|
17
|
-
@@languages[language] =
|
18
|
-
::Linguistics.const_get(l)
|
9
|
+
silence_warnings { require 'linguistics' }
|
10
|
+
@@languages = {}
|
11
|
+
|
12
|
+
def self.load(language)
|
13
|
+
if @@languages[language]
|
14
|
+
return @@languages[language]
|
19
15
|
end
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
16
|
+
begin
|
17
|
+
l = language.to_s.upcase
|
18
|
+
silence_warnings do
|
19
|
+
@@languages[language] =
|
20
|
+
::Linguistics.const_get(l)
|
21
|
+
end
|
22
|
+
rescue RuntimeError
|
23
|
+
raise "Ruby Linguistics does " +
|
24
|
+
"not have a module installed " +
|
25
|
+
"for the #{language} language."
|
26
|
+
end
|
27
|
+
|
24
28
|
end
|
25
29
|
|
26
30
|
end
|
@@ -1,27 +1,45 @@
|
|
1
|
-
|
1
|
+
module Treat
|
2
|
+
|
3
|
+
module Loaders
|
2
4
|
|
3
|
-
|
4
|
-
# registered with the Linguistics gem.
|
5
|
-
class Stanford
|
5
|
+
class Stanford
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
StanfordCoreNLP.jar_path =
|
10
|
-
Treat.bin + 'stanford/'
|
11
|
-
|
12
|
-
StanfordCoreNLP.model_path =
|
13
|
-
Treat.models + 'stanford/'
|
14
|
-
|
15
|
-
StanfordCoreNLP.use(
|
16
|
-
Treat::Languages.describe(
|
17
|
-
Treat.default_language))
|
7
|
+
require 'stanford-core-nlp'
|
18
8
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
9
|
+
class << self
|
10
|
+
attr_accessor :jar_path
|
11
|
+
attr_accessor :model_path
|
12
|
+
attr_accessor :loaded
|
13
|
+
end
|
14
|
+
|
15
|
+
self.jar_path = Treat.bin + 'stanford/'
|
16
|
+
self.model_path = Treat.models + 'stanford/'
|
17
|
+
self.loaded = false
|
18
|
+
|
19
|
+
def self.load(language = nil)
|
20
|
+
|
21
|
+
return if self.loaded
|
22
|
+
|
23
|
+
language ||=
|
24
|
+
Treat::Languages.describe(
|
25
|
+
Treat.default_language)
|
26
|
+
|
27
|
+
StanfordCoreNLP.jar_path = self.jar_path
|
28
|
+
StanfordCoreNLP.model_path = self.model_path
|
29
|
+
|
30
|
+
StanfordCoreNLP.use(language)
|
31
|
+
|
32
|
+
StanfordCoreNLP.log_file =
|
33
|
+
NULL_DEVICE if Treat.silence
|
34
|
+
|
35
|
+
StanfordCoreNLP.bind
|
36
|
+
|
37
|
+
self.loaded = true
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
24
42
|
|
25
43
|
end
|
26
|
-
|
27
|
-
end
|
44
|
+
|
45
|
+
end
|
@@ -26,7 +26,7 @@ class Treat::Processors::Parsers::Stanford
|
|
26
26
|
val = entity.to_s
|
27
27
|
lang = entity.language
|
28
28
|
init(lang, options)
|
29
|
-
|
29
|
+
|
30
30
|
text = ::StanfordCoreNLP::Text.new(val)
|
31
31
|
@@parsers[lang].annotate(text)
|
32
32
|
|
@@ -52,6 +52,10 @@ class Treat::Processors::Parsers::Stanford
|
|
52
52
|
|
53
53
|
def self.init(lang, options)
|
54
54
|
return if @@parsers[lang]
|
55
|
+
|
56
|
+
language = Treat::Languages.describe(lang)
|
57
|
+
Treat::Loaders::Stanford.load(language)
|
58
|
+
|
55
59
|
options = DefaultOptions.merge(options)
|
56
60
|
StanfordCoreNLP.use(lang)
|
57
61
|
if options[:tagger_model]
|
data/lib/treat/server.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
class Treat::Server
|
2
|
+
|
3
|
+
require 'thin'
|
4
|
+
|
5
|
+
def self.start
|
6
|
+
app = proc do |env|
|
7
|
+
#!/usr/bin/env ruby -w
|
8
|
+
# simple_service.rb
|
9
|
+
# A simple DRb service
|
10
|
+
|
11
|
+
# load DRb
|
12
|
+
require 'drb'
|
13
|
+
|
14
|
+
# start up the DRb service
|
15
|
+
DRb.start_service nil, []
|
16
|
+
|
17
|
+
# We need the uri of the service to connect a client
|
18
|
+
puts DRb.uri
|
19
|
+
|
20
|
+
# wait for the DRb service to finish before exiting
|
21
|
+
DRb.thread.join
|
22
|
+
end
|
23
|
+
run app
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
data/lib/treat/tree.rb
CHANGED
data/lib/treat.rb
CHANGED
@@ -10,7 +10,7 @@ module Treat
|
|
10
10
|
end
|
11
11
|
|
12
12
|
# The current version of Treat.
|
13
|
-
VERSION = "1.0.2"
|
13
|
+
VERSION = "1.0.3"
|
14
14
|
|
15
15
|
# Add methods to handle syntactic sugar,
|
16
16
|
# language configuration options, and paths.
|
@@ -49,7 +49,7 @@ module Treat
|
|
49
49
|
require 'treat/categories'
|
50
50
|
require 'treat/data_set'
|
51
51
|
require 'treat/proxies'
|
52
|
-
|
52
|
+
|
53
53
|
# Install packages for a given language.
|
54
54
|
def self.install(language = :english)
|
55
55
|
require 'treat/installer'
|
data/spec/sandbox.rb
CHANGED
@@ -1,24 +0,0 @@
|
|
1
|
-
require_relative '../lib/treat'
|
2
|
-
|
3
|
-
s = Sentence "Barack Obama was killed last night."
|
4
|
-
s.tokenize
|
5
|
-
|
6
|
-
puts s.word_with_position(2).inspect
|
7
|
-
|
8
|
-
s.word_with_position(2).set :highlighted, 1
|
9
|
-
|
10
|
-
cl = Treat::Classification.new(
|
11
|
-
:word,
|
12
|
-
[[:position, 0]],
|
13
|
-
:highlighted,
|
14
|
-
0
|
15
|
-
)
|
16
|
-
|
17
|
-
data_set = s.export(cl)
|
18
|
-
|
19
|
-
s2 = Sentence ''
|
20
|
-
w = Word 'Hello'
|
21
|
-
s2 << w
|
22
|
-
w.set :position, 2
|
23
|
-
|
24
|
-
puts w.classify(:mlp, :training => data_set).inspect
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rubyzip
|
@@ -200,6 +200,7 @@ files:
|
|
200
200
|
- lib/treat/retrievers/indexers/ferret.rb
|
201
201
|
- lib/treat/retrievers/searchers/ferret.rb
|
202
202
|
- lib/treat/retrievers.rb
|
203
|
+
- lib/treat/server.rb
|
203
204
|
- lib/treat/tree.rb
|
204
205
|
- lib/treat.rb
|
205
206
|
- spec/collection.rb
|
@@ -224,7 +225,9 @@ files:
|
|
224
225
|
- LICENSE
|
225
226
|
homepage: https://github.com/louismullie/treat
|
226
227
|
licenses: []
|
227
|
-
post_install_message:
|
228
|
+
post_install_message: ! "********************************************************************************\n\nThank
|
229
|
+
you for installing Treat!\n\nComplete the installation by running:\n\n require
|
230
|
+
'treat'\n Treat.install\n\ninside IRB or a Ruby script.\n\n********************************************************************************\n\n"
|
228
231
|
rdoc_options: []
|
229
232
|
require_paths:
|
230
233
|
- lib
|