treat 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +1 -1
- data/README.md +3 -3
- data/lib/treat/config.rb +10 -0
- data/lib/treat/core/data_set.rb +80 -32
- data/lib/treat/core/feature.rb +35 -0
- data/lib/treat/core/problem.rb +43 -0
- data/lib/treat/core/question.rb +27 -0
- data/lib/treat/entities/abilities/buildable.rb +5 -3
- data/lib/treat/entities/abilities/exportable.rb +4 -4
- data/lib/treat/entities/collection.rb +1 -1
- data/lib/treat/entities/document.rb +1 -1
- data/lib/treat/entities/group.rb +8 -5
- data/lib/treat/entities/section.rb +1 -1
- data/lib/treat/entities/token.rb +20 -8
- data/lib/treat/entities/zone.rb +6 -5
- data/lib/treat/loaders/linguistics.rb +18 -19
- data/lib/treat/loaders/stanford.rb +3 -2
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/extractors/language/what_language.rb +53 -57
- data/lib/treat/workers/extractors/name_tag/stanford.rb +8 -5
- data/lib/treat/workers/formatters/serializers/mongo.rb +33 -27
- data/lib/treat/workers/formatters/unserializers/mongo.rb +14 -36
- data/lib/treat/workers/learners/classifiers/id3.rb +4 -5
- data/lib/treat/workers/learners/classifiers/mlp.rb +1 -1
- data/lib/treat/workers.rb +1 -1
- data/spec/entity.rb +7 -5
- data/spec/phrase.rb +2 -2
- data/spec/zone.rb +2 -3
- metadata +37 -15
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/21552208.html +0 -683
- data/files/3_2_release_notes.html +0 -766
- data/files/nethttp-cheat-sheet-2940.html +0 -395
- data/files/weather-central-canada-heat-wave.html +0 -1370
- data/lib/treat/core/classification.rb +0 -63
- data/lib/treat/core/server.rb +0 -3
- data/spec/sandbox.rb +0 -223
- data/tmp/english.yaml +0 -10340
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 1.1.
|
1
|
+
Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 1.1.1
|
2
2
|
|
3
3
|
This program is free software: you can redistribute it and/or modify
|
4
4
|
it under the terms of the GNU General Public License as published by
|
data/README.md
CHANGED
@@ -21,9 +21,9 @@ Treat is a framework for natural language processing and computational linguisti
|
|
21
21
|
**Resources**
|
22
22
|
|
23
23
|
* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
|
24
|
-
* See how to [install Treat](https://github.com/louismullie/treat/wiki/
|
25
|
-
* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/
|
26
|
-
* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing
|
24
|
+
* See how to [install Treat](https://github.com/louismullie/treat/wiki/Installation).
|
25
|
+
* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Manual).
|
26
|
+
* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing).
|
27
27
|
* View a list of [papers](https://github.com/louismullie/treat/wiki/Papers) about tools included in this toolkit.
|
28
28
|
* Open an [issue](https://github.com/louismullie/treat/issues).
|
29
29
|
|
data/lib/treat/config.rb
CHANGED
@@ -105,6 +105,16 @@ module Treat::Config
|
|
105
105
|
end
|
106
106
|
end
|
107
107
|
end
|
108
|
+
|
109
|
+
Treat::Core.constants.each do |kname|
|
110
|
+
Object.class_eval do
|
111
|
+
klass = Treat::Core.const_get(kname)
|
112
|
+
define_method(kname) do |*args|
|
113
|
+
klass.new(*args)
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
108
118
|
end
|
109
119
|
|
110
120
|
# Turn off syntactic sugar.
|
data/lib/treat/core/data_set.rb
CHANGED
@@ -1,48 +1,96 @@
|
|
1
|
+
# A DataSet contains an entity classification
|
2
|
+
# problem as well as data for entities that
|
3
|
+
# have already been classified, complete with
|
4
|
+
# references to these entities.
|
1
5
|
class Treat::Core::DataSet
|
6
|
+
|
7
|
+
# Used to serialize Procs.
|
8
|
+
silence_warnings do
|
9
|
+
require 'sourcify'
|
10
|
+
end
|
2
11
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
12
|
+
# The classification problem this
|
13
|
+
# data set holds data for.
|
14
|
+
attr_accessor :problem
|
15
|
+
# Items that have been already
|
16
|
+
# classified (training data).
|
17
|
+
attr_accessor :items
|
18
|
+
# References to the IDs of the
|
19
|
+
# original entities contained
|
20
|
+
# in the data set.
|
21
|
+
attr_accessor :entities
|
7
22
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
23
|
+
# Initialize the DataSet. Can be
|
24
|
+
# done with a Problem entity
|
25
|
+
# (thereby creating an empty set)
|
26
|
+
# or with a filename (representing
|
27
|
+
# a serialized data set which will
|
28
|
+
# then be deserialized and loaded).
|
29
|
+
def initialize(prob_or_file)
|
30
|
+
if prob_or_file.is_a?(String)
|
31
|
+
ds = self.class.
|
32
|
+
unserialize(prob_or_file)
|
33
|
+
@problem = ds.problem
|
34
|
+
@items = ds.items
|
35
|
+
@entities = ds.entities
|
36
|
+
else
|
37
|
+
@problem = prob_or_file
|
38
|
+
@items, @entities = [], []
|
14
39
|
end
|
15
|
-
::Psych.load(
|
16
|
-
File.read(file))
|
17
|
-
end
|
18
|
-
|
19
|
-
def initialize(classification)
|
20
|
-
@classification = classification
|
21
|
-
@labels = classification.labels
|
22
|
-
@items = []
|
23
|
-
@ids = []
|
24
40
|
end
|
25
41
|
|
42
|
+
# Add an entity to the data set.
|
43
|
+
# The entity's relevant features
|
44
|
+
# are calculated based on the
|
45
|
+
# classification problem, and a
|
46
|
+
# line with the results of the
|
47
|
+
# calculation is added to the
|
48
|
+
# data set, along with the ID
|
49
|
+
# of the entity.
|
26
50
|
def <<(entity)
|
27
|
-
@items <<
|
28
|
-
@classification.
|
51
|
+
@items << @problem.
|
29
52
|
export_item(entity)
|
30
|
-
@
|
53
|
+
@entities << entity.id
|
31
54
|
end
|
32
55
|
|
33
|
-
|
34
|
-
|
35
|
-
|
56
|
+
# Marshal the data set to the supplied
|
57
|
+
# file name. Marshal is used for speed;
|
58
|
+
# other serialization options may be
|
59
|
+
# provided in later versions. This
|
60
|
+
# method relies on the sourcify gem
|
61
|
+
# to transform Feature procs to strings,
|
62
|
+
# since procs/lambdas can't be serialized.
|
63
|
+
def serialize(file)
|
64
|
+
problem = @problem.dup
|
65
|
+
problem.features.each do |feature|
|
66
|
+
next unless feature.proc
|
67
|
+
feature.proc = feature.proc.to_source
|
68
|
+
end
|
69
|
+
data = [problem, @items, @entities]
|
70
|
+
File.open(file, 'w') do |f|
|
71
|
+
f.write(Marshal.dump(data))
|
72
|
+
end
|
73
|
+
problem.features.each do |feature|
|
74
|
+
next unless feature.proc
|
75
|
+
source = feature.proc[5..-1]
|
76
|
+
feature.proc = eval("Proc.new #{source}")
|
36
77
|
end
|
37
78
|
end
|
38
79
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
80
|
+
# Unserialize a data set file created
|
81
|
+
# by using the #serialize method.
|
82
|
+
def self.unserialize(file)
|
83
|
+
data = Marshal.load(File.read(file))
|
84
|
+
problem, items, entities = *data
|
85
|
+
problem.features.each do |feature|
|
86
|
+
next unless feature.proc
|
87
|
+
source = feature.proc[5..-1]
|
88
|
+
feature.proc = eval("Proc.new #{source}")
|
89
|
+
end
|
90
|
+
data_set = Treat::Core::DataSet.new(problem)
|
91
|
+
data_set.items = items
|
92
|
+
data_set.entities = entities
|
93
|
+
data_set
|
46
94
|
end
|
47
95
|
|
48
96
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Represents a feature to be used
|
2
|
+
# in a classification task.
|
3
|
+
class Treat::Core::Feature
|
4
|
+
|
5
|
+
# The name of the feature. If no
|
6
|
+
# proc is supplied, this assumes
|
7
|
+
# that the target of your classification
|
8
|
+
# problem responds to the method
|
9
|
+
# corresponding to this name.
|
10
|
+
attr_accessor :name
|
11
|
+
# A proc that can be used to perform
|
12
|
+
# calculations before storing a feature.
|
13
|
+
attr_accessor :proc
|
14
|
+
# The default value to be used when the feature's computed value is nil or false.
|
15
|
+
attr_accessor :default
|
16
|
+
|
17
|
+
# Initialize a feature for a classification
|
18
|
+
# problem. If two arguments are supplied,
|
19
|
+
# the second argument is assumed to be the
|
20
|
+
# default value. If three arguments are
|
21
|
+
# supplied, the second argument is the
|
22
|
+
# callback to generate the feature, and
|
23
|
+
# the third one is the default value.
|
24
|
+
def initialize(name, proc_or_default = nil, default = nil)
|
25
|
+
@name = name
|
26
|
+
if proc_or_default.is_a?(Proc)
|
27
|
+
@proc, @default =
|
28
|
+
proc_or_default, default
|
29
|
+
else
|
30
|
+
@proc = nil
|
31
|
+
@default = proc_or_default
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Defines a classification problem.
|
2
|
+
# - What question are we trying to answer?
|
3
|
+
# - What features are we going to look at
|
4
|
+
# to attempt to answer that question?
|
5
|
+
class Treat::Core::Problem
|
6
|
+
|
7
|
+
# The question we are trying to answer.
|
8
|
+
attr_accessor :question
|
9
|
+
# An array of features that will be
|
10
|
+
# looked at in trying to answer the
|
11
|
+
# problem's question.
|
12
|
+
attr_accessor :features
|
13
|
+
# Just the labels from the features.
|
14
|
+
attr_accessor :labels
|
15
|
+
|
16
|
+
# Initialize the problem with a question
|
17
|
+
# and an arbitrary number of features.
|
18
|
+
def initialize(question, *features)
|
19
|
+
@question = question
|
20
|
+
@features = features
|
21
|
+
@labels = @features.map { |f| f.name }
|
22
|
+
end
|
23
|
+
|
24
|
+
# Return an array of all the entity's
|
25
|
+
# features, as defined by the problem.
|
26
|
+
# If include_answer is set to true, will
|
27
|
+
# append the answer to the problem after
|
28
|
+
# all of the features.
|
29
|
+
def export_item(e, include_answer = true)
|
30
|
+
line = []
|
31
|
+
@features.each do |feature|
|
32
|
+
r = feature.proc ?
|
33
|
+
feature.proc.call(e) :
|
34
|
+
e.send(feature.name)
|
35
|
+
line << (r || feature.default)
|
36
|
+
end
|
37
|
+
return line unless include_answer
|
38
|
+
line << (e.has?(@question.name) ?
|
39
|
+
e.get(@question.name) : @question.default)
|
40
|
+
line
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Defines a question to answer in the
|
2
|
+
# context of a classification problem.
|
3
|
+
class Treat::Core::Question
|
4
|
+
|
5
|
+
# Defines an arbitrary label for the
|
6
|
+
# question we are trying to answer
|
7
|
+
# (e.g. is_key_sentence), which will
|
8
|
+
# also be used as the annotation name
|
9
|
+
# for the answer to the question.
|
10
|
+
attr_accessor :name
|
11
|
+
# Can be :continuous or :discrete,
|
12
|
+
# depending on the features used.
|
13
|
+
attr_accessor :type
|
14
|
+
# Defines the target of the question
|
15
|
+
# (e.g. :sentence, :paragraph, etc.)
|
16
|
+
attr_accessor :target
|
17
|
+
# Default for the answer to the question.
|
18
|
+
attr_accessor :default
|
19
|
+
|
20
|
+
# Initialize the question.
|
21
|
+
def initialize(name, target,
|
22
|
+
type = :continuous, default = nil)
|
23
|
+
@name, @target = name, target
|
24
|
+
@type, @default = type, default
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
# from a folder of files, a specific file,
|
3
3
|
# a string or a numeric object. This class
|
4
4
|
# is pretty much self-explanatory.
|
5
|
+
# FIXME how can we make this language independent?
|
5
6
|
module Treat::Entities::Abilities::Buildable
|
6
7
|
|
7
8
|
require 'schiphol'
|
@@ -162,8 +163,10 @@ module Treat::Entities::Abilities::Buildable
|
|
162
163
|
# Build a document from a raw or serialized file.
|
163
164
|
def from_file(file, options)
|
164
165
|
|
165
|
-
|
166
|
-
|
166
|
+
if file.index('yml') ||
|
167
|
+
file.index('yaml') ||
|
168
|
+
file.index('xml') ||
|
169
|
+
file.index('mongo')
|
167
170
|
from_serialized_file(file, options)
|
168
171
|
else
|
169
172
|
fmt = Treat::Workers::Formatters::Readers::Autoselect.
|
@@ -221,7 +224,6 @@ module Treat::Entities::Abilities::Buildable
|
|
221
224
|
id = options[:id]
|
222
225
|
e = self.new(nil, id)
|
223
226
|
e.unserialize(adapter, options)
|
224
|
-
e
|
225
227
|
end
|
226
228
|
|
227
229
|
# Build any kind of entity from a string.
|
@@ -1,11 +1,11 @@
|
|
1
1
|
module Treat::Entities::Abilities::Exportable
|
2
2
|
|
3
|
-
def export(
|
4
|
-
ds = Treat::Core::DataSet.new(
|
5
|
-
each_entity(
|
3
|
+
def export(problem)
|
4
|
+
ds = Treat::Core::DataSet.new(problem)
|
5
|
+
each_entity(problem.question.target) do |e|
|
6
6
|
ds << e
|
7
7
|
end
|
8
8
|
ds
|
9
9
|
end
|
10
10
|
|
11
|
-
end
|
11
|
+
end
|
data/lib/treat/entities/group.rb
CHANGED
@@ -1,15 +1,18 @@
|
|
1
1
|
module Treat::Entities
|
2
2
|
|
3
|
-
#
|
4
|
-
class Group <
|
3
|
+
# Represents a group of tokens.
|
4
|
+
class Group < Entity; end
|
5
5
|
|
6
|
-
# Represents a group of words
|
6
|
+
# Represents a group of words
|
7
|
+
# with a sentence ender (.!?)
|
7
8
|
class Sentence < Group; end
|
8
9
|
|
9
|
-
# Represents a group of words
|
10
|
+
# Represents a group of words,
|
11
|
+
# with no sentence ender.
|
10
12
|
class Phrase < Group; end
|
11
13
|
|
12
|
-
# Represents a non-linguistic
|
14
|
+
# Represents a non-linguistic
|
15
|
+
# fragment (e.g. stray symbols).
|
13
16
|
class Fragment < Group; end
|
14
17
|
|
15
18
|
end
|
data/lib/treat/entities/token.rb
CHANGED
@@ -1,31 +1,43 @@
|
|
1
1
|
module Treat::Entities
|
2
|
-
|
3
|
-
|
2
|
+
|
3
|
+
# Represents a terminal element
|
4
|
+
# (leaf) in the text structure.
|
5
|
+
class Token < Entity; end
|
4
6
|
|
5
|
-
# Represents a word.
|
7
|
+
# Represents a word. Strictly,
|
8
|
+
# this is /^[[:alpha:]\-']+$/.
|
6
9
|
class Word < Token; end
|
7
10
|
|
8
|
-
# Represents
|
11
|
+
# Represents an enclitic.
|
12
|
+
# Strictly, this is any of
|
13
|
+
# 'll 'm 're 's 't or 've.
|
9
14
|
class Enclitic < Token; end
|
10
15
|
|
11
|
-
# Represents a number.
|
16
|
+
# Represents a number. Strictly,
|
17
|
+
# this is /^#?([0-9]+)(\.[0-9]+)?$/.
|
12
18
|
class Number < Token
|
13
19
|
def to_i; to_s.to_i; end
|
14
20
|
def to_f; to_s.to_f; end
|
15
21
|
end
|
16
22
|
|
17
23
|
# Represents a punctuation sign.
|
24
|
+
# Strictly, this is /^[[:punct:]\$]+$/.
|
18
25
|
class Punctuation < Token; end
|
19
26
|
|
20
27
|
# Represents a character that is neither
|
21
|
-
#
|
22
|
-
# character (e.g. @#$%&*).
|
28
|
+
# a word, an enclitic, a number or a
|
29
|
+
# punctuation character (e.g. @#$%&*).
|
23
30
|
class Symbol < Token; end
|
24
31
|
|
25
|
-
# Represents a url.
|
32
|
+
# Represents a url. This is (imperfectly)
|
33
|
+
# defined as /^(http|https):\/\/[a-z0-9]
|
34
|
+
# +([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}
|
35
|
+
# (([0-9]{1,5})?\/.*)?$/ix
|
26
36
|
class Url < Token; end
|
27
37
|
|
28
38
|
# Represents a valid RFC822 address.
|
39
|
+
# This is (imperfectly) defined as
|
40
|
+
# /.+\@.+\..+/ (fixme maybe?)
|
29
41
|
class Email < Token; end
|
30
42
|
|
31
43
|
# Represents a token whose type
|
data/lib/treat/entities/zone.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
module Treat::Entities
|
2
|
-
# Represents a zone of text
|
3
|
-
|
4
|
-
class Zone < Treat::Entities::Entity; end
|
2
|
+
# Represents a zone of text.
|
3
|
+
class Zone < Entity; end
|
5
4
|
|
6
|
-
# Represents a title, subtitle,
|
5
|
+
# Represents a title, subtitle,
|
6
|
+
# or logical header of a text.
|
7
7
|
class Title < Zone; end
|
8
8
|
|
9
|
-
# Represents a paragraph
|
9
|
+
# Represents a paragraph (group
|
10
|
+
# of sentences and/or phrases).
|
10
11
|
class Paragraph < Zone; end
|
11
12
|
end
|
@@ -2,28 +2,27 @@
|
|
2
2
|
# registered with the Linguistics gem.
|
3
3
|
class Treat::Loaders::Linguistics
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
5
|
+
# Linguistics throws warnings; silence them.
|
6
|
+
silence_warnings { require 'linguistics' }
|
7
|
+
|
8
|
+
# Linguistics classes for each language.
|
9
9
|
@@languages = {}
|
10
10
|
|
11
|
+
# Load the Linguistics class that corresponds
|
12
|
+
# to the supplied language; raises an exception
|
13
|
+
# if there is no such language class registered.
|
11
14
|
def self.load(language)
|
12
|
-
|
13
|
-
|
15
|
+
silence_warnings do
|
16
|
+
@@languages[language] ||=
|
17
|
+
::Linguistics.const_get(
|
18
|
+
language.to_s[0..1].upcase)
|
14
19
|
end
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
rescue RuntimeError
|
22
|
-
raise "Ruby Linguistics does " +
|
23
|
-
"not have a module installed " +
|
24
|
-
"for the #{language} language."
|
25
|
-
end
|
26
|
-
|
20
|
+
return @@languages[language]
|
21
|
+
rescue RuntimeError
|
22
|
+
raise Treat::Exception,
|
23
|
+
"Ruby Linguistics does " +
|
24
|
+
"not have a module installed " +
|
25
|
+
"for the #{language} language."
|
27
26
|
end
|
28
27
|
|
29
|
-
end
|
28
|
+
end
|
@@ -1,10 +1,11 @@
|
|
1
|
-
# A helper class to load the
|
2
|
-
# Stanford Core NLP package.
|
1
|
+
# A helper class to load the CoreNLP package.
|
3
2
|
class Treat::Loaders::Stanford
|
4
3
|
|
5
4
|
require 'stanford-core-nlp'
|
5
|
+
|
6
6
|
@@loaded = false
|
7
7
|
|
8
|
+
# Load CoreNLP package for a given language.
|
8
9
|
def self.load(language = nil)
|
9
10
|
return if @@loaded
|
10
11
|
language ||= Treat.core.language.default
|
data/lib/treat/version.rb
CHANGED
@@ -1,61 +1,57 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
if options[:bias_toward].include?(l)
|
51
|
-
return l
|
52
|
-
end
|
1
|
+
# Adaptor for the 'whatlanguage' gem, which
|
2
|
+
# performs probabilistic language detection.
|
3
|
+
# The library works by checking for the presence
|
4
|
+
# of words with bloom filters built from
|
5
|
+
# dictionaries based upon each source language.
|
6
|
+
module Treat::Workers::Extractors::Language::WhatLanguage
|
7
|
+
|
8
|
+
# Require the 'whatlanguage' gem.
|
9
|
+
silence_warnings { require 'whatlanguage' }
|
10
|
+
|
11
|
+
# Undefine the method defined by the gem.
|
12
|
+
String.class_eval { undef :language }
|
13
|
+
|
14
|
+
# By default, bias towards common languages.
|
15
|
+
DefaultOptions = {
|
16
|
+
:bias_toward => [:english, :french, :chinese, :german, :arabic, :spanish]
|
17
|
+
}
|
18
|
+
|
19
|
+
# Keep only one instance of the gem class.
|
20
|
+
@@detector = nil
|
21
|
+
|
22
|
+
# Detect the language of an entity using the
|
23
|
+
# 'whatlanguage' gem. Return an identifier
|
24
|
+
# corresponding to the ISO-639-2 code for the
|
25
|
+
# language.
|
26
|
+
#
|
27
|
+
# Options:
|
28
|
+
#
|
29
|
+
# - (Array of Symbols) bias_toward => Languages to bias
|
30
|
+
# toward when more than one language is detected
|
31
|
+
# with equal probability.
|
32
|
+
def self.language(entity, options = {})
|
33
|
+
|
34
|
+
options = DefaultOptions.merge(options)
|
35
|
+
|
36
|
+
@@detector ||= ::WhatLanguage.new(:possibilities)
|
37
|
+
possibilities = @@detector.process_text(entity.to_s)
|
38
|
+
lang = {}
|
39
|
+
|
40
|
+
possibilities.each do |k,v|
|
41
|
+
lang[k.intern] = v
|
42
|
+
end
|
43
|
+
|
44
|
+
max = lang.values.max
|
45
|
+
ordered = lang.select { |i,j| j == max }.keys
|
46
|
+
|
47
|
+
ordered.each do |l|
|
48
|
+
if options[:bias_toward].include?(l)
|
49
|
+
return l
|
53
50
|
end
|
54
|
-
|
55
|
-
return ordered.first
|
56
|
-
|
57
51
|
end
|
58
|
-
|
52
|
+
|
53
|
+
return ordered.first
|
54
|
+
|
59
55
|
end
|
60
56
|
|
61
|
-
end
|
57
|
+
end
|
@@ -12,24 +12,27 @@ class Treat::Workers::Extractors::NameTag::Stanford
|
|
12
12
|
|
13
13
|
pp = nil
|
14
14
|
|
15
|
-
|
15
|
+
language = entity.language
|
16
16
|
|
17
|
-
Treat::Loaders::Stanford.load(
|
17
|
+
Treat::Loaders::Stanford.load(language)
|
18
18
|
|
19
19
|
isolated_token = entity.is_a?(Treat::Entities::Token)
|
20
20
|
tokens = isolated_token ? [entity] : entity.tokens
|
21
21
|
|
22
22
|
ms = StanfordCoreNLP::Config::Models[:ner][language]
|
23
|
-
|
23
|
+
model_path = Treat.libraries.stanford.model_path ||
|
24
|
+
(Treat.paths.models + '/stanford/')
|
25
|
+
ms = model_path + '/' +
|
24
26
|
StanfordCoreNLP::Config::ModelFolders[:ner] +
|
25
27
|
ms['3class']
|
26
28
|
|
27
|
-
@@classifiers[
|
29
|
+
@@classifiers[language] ||=
|
28
30
|
StanfordCoreNLP::CRFClassifier.
|
29
31
|
getClassifier(ms)
|
30
32
|
|
31
33
|
token_list = StanfordCoreNLP.get_list(tokens)
|
32
|
-
sentence = @@classifiers[
|
34
|
+
sentence = @@classifiers[language].
|
35
|
+
classify_sentence(token_list)
|
33
36
|
|
34
37
|
i = 0
|
35
38
|
n = 0
|