treat 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/treat/classification.rb +1 -1
- data/lib/treat/dependencies.rb +4 -4
- data/lib/treat/entities/abilities/buildable.rb +4 -3
- data/lib/treat/entities/abilities/iterable.rb +1 -1
- data/lib/treat/entities/abilities/stringable.rb +1 -1
- data/lib/treat/entities/entities.rb +1 -1
- data/lib/treat/extractors/name_tag/stanford.rb +5 -3
- data/lib/treat/formatters/readers/xml.rb +1 -0
- data/lib/treat/formatters/serializers/mongo.rb +1 -1
- data/lib/treat/lexicalizers/taggers/stanford.rb +3 -3
- data/lib/treat/loaders/linguistics.rb +23 -19
- data/lib/treat/loaders/stanford.rb +40 -22
- data/lib/treat/processors/parsers/stanford.rb +5 -1
- data/lib/treat/processors/segmenters/stanford.rb +2 -1
- data/lib/treat/processors/tokenizers/stanford.rb +2 -1
- data/lib/treat/server.rb +26 -0
- data/lib/treat/tree.rb +2 -0
- data/lib/treat.rb +2 -2
- data/spec/sandbox.rb +0 -24
- metadata +6 -3
data/lib/treat/classification.rb
CHANGED
data/lib/treat/dependencies.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
class Treat::Dependencies
|
2
2
|
|
3
3
|
Gem = [
|
4
|
-
['psych', '1.2.2', '(un)serialize annotated entities to YAML format'],
|
5
|
-
['nokogiri', '>= 1.
|
4
|
+
['psych', '>= 1.2.2', '(un)serialize annotated entities to YAML format'],
|
5
|
+
['nokogiri', '>= 1.5.2', 'read and parse XML and HTML formats'],
|
6
6
|
['sdsykes-ferret', '>= 0.11.6.19', 'perform full-text search in collections'],
|
7
7
|
['lda-ruby', '>= 0.3.8', 'extract topic words from documents and collections'],
|
8
|
-
['ruby-readability', '>= 0.5.
|
9
|
-
['stanford-core-nlp', '>= 0.
|
8
|
+
['ruby-readability', '>= 0.5.2', 'extract the readable content from HTML pages'],
|
9
|
+
['stanford-core-nlp', '>= 0.3.0', 'tokenize, segment, parse texts and perform named entity recognition'],
|
10
10
|
['whatlanguage', '>= 1.0.0', 'detect the language of text'],
|
11
11
|
['linguistics', '>= 1.0.9', 'retrieve the inflection of nouns, verbs and numbers in English'],
|
12
12
|
['punkt-segmenter', '>= 0.9.1', 'segment texts into sentences'],
|
@@ -13,7 +13,8 @@ module Treat::Entities::Abilities::Buildable
|
|
13
13
|
PunctRegexp = /^[[:punct:]\$]+$/
|
14
14
|
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
|
15
15
|
EmailRegexp = /.+\@.+\..+/
|
16
|
-
|
16
|
+
Enclitics = %w['ll 'm 're 's 't 've]
|
17
|
+
|
17
18
|
# Reserved folder names
|
18
19
|
Reserved = ['.index']
|
19
20
|
|
@@ -264,8 +265,8 @@ module Treat::Entities::Abilities::Buildable
|
|
264
265
|
def token_from_string(string)
|
265
266
|
|
266
267
|
check_encoding(string)
|
267
|
-
if string
|
268
|
-
Treat::Entities::
|
268
|
+
if Enclitics.include?(string.downcase)
|
269
|
+
Treat::Entities::Enclitic.new(string)
|
269
270
|
elsif string =~ WordRegexp &&
|
270
271
|
string.count(' ') == 0 &&
|
271
272
|
string != '-'
|
@@ -69,7 +69,7 @@ module Treat::Entities::Abilities::Stringable
|
|
69
69
|
|
70
70
|
if child.is_a?(Treat::Entities::Token) || child.value != ''
|
71
71
|
if child.is_a?(Treat::Entities::Punctuation) ||
|
72
|
-
child.is_a?(Treat::Entities::
|
72
|
+
child.is_a?(Treat::Entities::Enclitic)
|
73
73
|
value.strip!
|
74
74
|
end
|
75
75
|
value += child.to_s + ' '
|
@@ -4,7 +4,8 @@
|
|
4
4
|
class Treat::Extractors::NameTag::Stanford
|
5
5
|
|
6
6
|
require 'treat/loaders/stanford'
|
7
|
-
|
7
|
+
Treat::Loaders::Stanford.load
|
8
|
+
|
8
9
|
@@classifiers = {}
|
9
10
|
|
10
11
|
def self.name_tag(entity, options = {})
|
@@ -12,9 +13,10 @@ class Treat::Extractors::NameTag::Stanford
|
|
12
13
|
pp = nil
|
13
14
|
|
14
15
|
lang = entity.language
|
15
|
-
|
16
|
+
|
16
17
|
language = Treat::Languages.describe(lang)
|
17
|
-
|
18
|
+
Treat::Loaders::Stanford.load(language)
|
19
|
+
|
18
20
|
isolated_token = entity.is_a?(Treat::Entities::Token)
|
19
21
|
tokens = isolated_token ? [entity] : entity.tokens
|
20
22
|
|
@@ -22,6 +22,7 @@ class Treat::Lexicalizers::Taggers::Stanford
|
|
22
22
|
|
23
23
|
# Handle options and initialize the tagger.
|
24
24
|
lang = entity.language
|
25
|
+
|
25
26
|
options = get_options(options, lang)
|
26
27
|
tokens, list = get_token_list(entity)
|
27
28
|
init_tagger(lang)
|
@@ -55,14 +56,13 @@ class Treat::Lexicalizers::Taggers::Stanford
|
|
55
56
|
|
56
57
|
# Initialize the tagger for a language.
|
57
58
|
def self.init_tagger(lang)
|
58
|
-
|
59
59
|
language = Treat::Languages.describe(lang)
|
60
|
+
Treat::Loaders::Stanford.load(language)
|
60
61
|
model = StanfordCoreNLP::Config::Models[:pos][language]
|
61
|
-
model = Treat.
|
62
|
+
model = Treat::Loaders::Stanford.model_path +
|
62
63
|
StanfordCoreNLP::Config::ModelFolders[:pos] + model
|
63
64
|
@@taggers[lang] ||=
|
64
65
|
StanfordCoreNLP::MaxentTagger.new(model)
|
65
|
-
|
66
66
|
end
|
67
67
|
|
68
68
|
# Handle the options for the tagger.
|
@@ -1,26 +1,30 @@
|
|
1
|
-
|
1
|
+
module Treat
|
2
2
|
|
3
|
-
|
4
|
-
# registered with the Linguistics gem.
|
5
|
-
class Linguistics
|
3
|
+
module Loaders
|
6
4
|
|
7
|
-
|
8
|
-
|
5
|
+
# A helper class to load a language class
|
6
|
+
# registered with the Linguistics gem.
|
7
|
+
class Linguistics
|
9
8
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
silence_warnings do
|
17
|
-
@@languages[language] =
|
18
|
-
::Linguistics.const_get(l)
|
9
|
+
silence_warnings { require 'linguistics' }
|
10
|
+
@@languages = {}
|
11
|
+
|
12
|
+
def self.load(language)
|
13
|
+
if @@languages[language]
|
14
|
+
return @@languages[language]
|
19
15
|
end
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
16
|
+
begin
|
17
|
+
l = language.to_s.upcase
|
18
|
+
silence_warnings do
|
19
|
+
@@languages[language] =
|
20
|
+
::Linguistics.const_get(l)
|
21
|
+
end
|
22
|
+
rescue RuntimeError
|
23
|
+
raise "Ruby Linguistics does " +
|
24
|
+
"not have a module installed " +
|
25
|
+
"for the #{language} language."
|
26
|
+
end
|
27
|
+
|
24
28
|
end
|
25
29
|
|
26
30
|
end
|
@@ -1,27 +1,45 @@
|
|
1
|
-
|
1
|
+
module Treat
|
2
|
+
|
3
|
+
module Loaders
|
2
4
|
|
3
|
-
|
4
|
-
# registered with the Linguistics gem.
|
5
|
-
class Stanford
|
5
|
+
class Stanford
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
StanfordCoreNLP.jar_path =
|
10
|
-
Treat.bin + 'stanford/'
|
11
|
-
|
12
|
-
StanfordCoreNLP.model_path =
|
13
|
-
Treat.models + 'stanford/'
|
14
|
-
|
15
|
-
StanfordCoreNLP.use(
|
16
|
-
Treat::Languages.describe(
|
17
|
-
Treat.default_language))
|
7
|
+
require 'stanford-core-nlp'
|
18
8
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
9
|
+
class << self
|
10
|
+
attr_accessor :jar_path
|
11
|
+
attr_accessor :model_path
|
12
|
+
attr_accessor :loaded
|
13
|
+
end
|
14
|
+
|
15
|
+
self.jar_path = Treat.bin + 'stanford/'
|
16
|
+
self.model_path = Treat.models + 'stanford/'
|
17
|
+
self.loaded = false
|
18
|
+
|
19
|
+
def self.load(language = nil)
|
20
|
+
|
21
|
+
return if self.loaded
|
22
|
+
|
23
|
+
language ||=
|
24
|
+
Treat::Languages.describe(
|
25
|
+
Treat.default_language)
|
26
|
+
|
27
|
+
StanfordCoreNLP.jar_path = self.jar_path
|
28
|
+
StanfordCoreNLP.model_path = self.model_path
|
29
|
+
|
30
|
+
StanfordCoreNLP.use(language)
|
31
|
+
|
32
|
+
StanfordCoreNLP.log_file =
|
33
|
+
NULL_DEVICE if Treat.silence
|
34
|
+
|
35
|
+
StanfordCoreNLP.bind
|
36
|
+
|
37
|
+
self.loaded = true
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
24
42
|
|
25
43
|
end
|
26
|
-
|
27
|
-
end
|
44
|
+
|
45
|
+
end
|
@@ -26,7 +26,7 @@ class Treat::Processors::Parsers::Stanford
|
|
26
26
|
val = entity.to_s
|
27
27
|
lang = entity.language
|
28
28
|
init(lang, options)
|
29
|
-
|
29
|
+
|
30
30
|
text = ::StanfordCoreNLP::Text.new(val)
|
31
31
|
@@parsers[lang].annotate(text)
|
32
32
|
|
@@ -52,6 +52,10 @@ class Treat::Processors::Parsers::Stanford
|
|
52
52
|
|
53
53
|
def self.init(lang, options)
|
54
54
|
return if @@parsers[lang]
|
55
|
+
|
56
|
+
language = Treat::Languages.describe(lang)
|
57
|
+
Treat::Loaders::Stanford.load(language)
|
58
|
+
|
55
59
|
options = DefaultOptions.merge(options)
|
56
60
|
StanfordCoreNLP.use(lang)
|
57
61
|
if options[:tagger_model]
|
data/lib/treat/server.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
class Treat::Server
|
2
|
+
|
3
|
+
require 'thin'
|
4
|
+
|
5
|
+
def self.start
|
6
|
+
app = proc do |env|
|
7
|
+
#!/usr/bin/env ruby -w
|
8
|
+
# simple_service.rb
|
9
|
+
# A simple DRb service
|
10
|
+
|
11
|
+
# load DRb
|
12
|
+
require 'drb'
|
13
|
+
|
14
|
+
# start up the DRb service
|
15
|
+
DRb.start_service nil, []
|
16
|
+
|
17
|
+
# We need the uri of the service to connect a client
|
18
|
+
puts DRb.uri
|
19
|
+
|
20
|
+
# wait for the DRb service to finish before exiting
|
21
|
+
DRb.thread.join
|
22
|
+
end
|
23
|
+
run app
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
data/lib/treat/tree.rb
CHANGED
data/lib/treat.rb
CHANGED
@@ -10,7 +10,7 @@ module Treat
|
|
10
10
|
end
|
11
11
|
|
12
12
|
# The current version of Treat.
|
13
|
-
VERSION = "1.0.
|
13
|
+
VERSION = "1.0.3"
|
14
14
|
|
15
15
|
# Add methods to handle syntactic sugar,
|
16
16
|
# language configuration options, and paths.
|
@@ -49,7 +49,7 @@ module Treat
|
|
49
49
|
require 'treat/categories'
|
50
50
|
require 'treat/data_set'
|
51
51
|
require 'treat/proxies'
|
52
|
-
|
52
|
+
|
53
53
|
# Install packages for a given language.
|
54
54
|
def self.install(language = :english)
|
55
55
|
require 'treat/installer'
|
data/spec/sandbox.rb
CHANGED
@@ -1,24 +0,0 @@
|
|
1
|
-
require_relative '../lib/treat'
|
2
|
-
|
3
|
-
s = Sentence "Barack Obama was killed last night."
|
4
|
-
s.tokenize
|
5
|
-
|
6
|
-
puts s.word_with_position(2).inspect
|
7
|
-
|
8
|
-
s.word_with_position(2).set :highlighted, 1
|
9
|
-
|
10
|
-
cl = Treat::Classification.new(
|
11
|
-
:word,
|
12
|
-
[[:position, 0]],
|
13
|
-
:highlighted,
|
14
|
-
0
|
15
|
-
)
|
16
|
-
|
17
|
-
data_set = s.export(cl)
|
18
|
-
|
19
|
-
s2 = Sentence ''
|
20
|
-
w = Word 'Hello'
|
21
|
-
s2 << w
|
22
|
-
w.set :position, 2
|
23
|
-
|
24
|
-
puts w.classify(:mlp, :training => data_set).inspect
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rubyzip
|
@@ -200,6 +200,7 @@ files:
|
|
200
200
|
- lib/treat/retrievers/indexers/ferret.rb
|
201
201
|
- lib/treat/retrievers/searchers/ferret.rb
|
202
202
|
- lib/treat/retrievers.rb
|
203
|
+
- lib/treat/server.rb
|
203
204
|
- lib/treat/tree.rb
|
204
205
|
- lib/treat.rb
|
205
206
|
- spec/collection.rb
|
@@ -224,7 +225,9 @@ files:
|
|
224
225
|
- LICENSE
|
225
226
|
homepage: https://github.com/louismullie/treat
|
226
227
|
licenses: []
|
227
|
-
post_install_message:
|
228
|
+
post_install_message: ! "********************************************************************************\n\nThank
|
229
|
+
you for installing Treat!\n\nComplete the installation by running:\n\n require
|
230
|
+
'treat'\n Treat.install\n\ninside IRB or a Ruby script.\n\n********************************************************************************\n\n"
|
228
231
|
rdoc_options: []
|
229
232
|
require_paths:
|
230
233
|
- lib
|