treat 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +1 -1
- data/README.md +3 -3
- data/lib/treat/config.rb +10 -0
- data/lib/treat/core/data_set.rb +80 -32
- data/lib/treat/core/feature.rb +35 -0
- data/lib/treat/core/problem.rb +43 -0
- data/lib/treat/core/question.rb +27 -0
- data/lib/treat/entities/abilities/buildable.rb +5 -3
- data/lib/treat/entities/abilities/exportable.rb +4 -4
- data/lib/treat/entities/collection.rb +1 -1
- data/lib/treat/entities/document.rb +1 -1
- data/lib/treat/entities/group.rb +8 -5
- data/lib/treat/entities/section.rb +1 -1
- data/lib/treat/entities/token.rb +20 -8
- data/lib/treat/entities/zone.rb +6 -5
- data/lib/treat/loaders/linguistics.rb +18 -19
- data/lib/treat/loaders/stanford.rb +3 -2
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/extractors/language/what_language.rb +53 -57
- data/lib/treat/workers/extractors/name_tag/stanford.rb +8 -5
- data/lib/treat/workers/formatters/serializers/mongo.rb +33 -27
- data/lib/treat/workers/formatters/unserializers/mongo.rb +14 -36
- data/lib/treat/workers/learners/classifiers/id3.rb +4 -5
- data/lib/treat/workers/learners/classifiers/mlp.rb +1 -1
- data/lib/treat/workers.rb +1 -1
- data/spec/entity.rb +7 -5
- data/spec/phrase.rb +2 -2
- data/spec/zone.rb +2 -3
- metadata +37 -15
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/21552208.html +0 -683
- data/files/3_2_release_notes.html +0 -766
- data/files/nethttp-cheat-sheet-2940.html +0 -395
- data/files/weather-central-canada-heat-wave.html +0 -1370
- data/lib/treat/core/classification.rb +0 -63
- data/lib/treat/core/server.rb +0 -3
- data/spec/sandbox.rb +0 -223
- data/tmp/english.yaml +0 -10340
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 1.1.
|
1
|
+
Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 1.1.1
|
2
2
|
|
3
3
|
This program is free software: you can redistribute it and/or modify
|
4
4
|
it under the terms of the GNU General Public License as published by
|
data/README.md
CHANGED
@@ -21,9 +21,9 @@ Treat is a framework for natural language processing and computational linguisti
|
|
21
21
|
**Resources**
|
22
22
|
|
23
23
|
* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
|
24
|
-
* See how to [install Treat](https://github.com/louismullie/treat/wiki/
|
25
|
-
* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/
|
26
|
-
* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing
|
24
|
+
* See how to [install Treat](https://github.com/louismullie/treat/wiki/Installation).
|
25
|
+
* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Manual).
|
26
|
+
* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing).
|
27
27
|
* View a list of [papers](https://github.com/louismullie/treat/wiki/Papers) about tools included in this toolkit.
|
28
28
|
* Open an [issue](https://github.com/louismullie/treat/issues).
|
29
29
|
|
data/lib/treat/config.rb
CHANGED
@@ -105,6 +105,16 @@ module Treat::Config
|
|
105
105
|
end
|
106
106
|
end
|
107
107
|
end
|
108
|
+
|
109
|
+
Treat::Core.constants.each do |kname|
|
110
|
+
Object.class_eval do
|
111
|
+
klass = Treat::Core.const_get(kname)
|
112
|
+
define_method(kname) do |*args|
|
113
|
+
klass.new(*args)
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
108
118
|
end
|
109
119
|
|
110
120
|
# Turn off syntactic sugar.
|
data/lib/treat/core/data_set.rb
CHANGED
@@ -1,48 +1,96 @@
|
|
1
|
+
# A DataSet contains an entity classification
|
2
|
+
# problem as well as data for entities that
|
3
|
+
# have already been classified, complete with
|
4
|
+
# references to these entities.
|
1
5
|
class Treat::Core::DataSet
|
6
|
+
|
7
|
+
# Used to serialize Procs.
|
8
|
+
silence_warnings do
|
9
|
+
require 'sourcify'
|
10
|
+
end
|
2
11
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
12
|
+
# The classification problem this
|
13
|
+
# data set holds data for.
|
14
|
+
attr_accessor :problem
|
15
|
+
# Items that have been already
|
16
|
+
# classified (training data).
|
17
|
+
attr_accessor :items
|
18
|
+
# References to the IDs of the
|
19
|
+
# original entities contained
|
20
|
+
# in the data set.
|
21
|
+
attr_accessor :entities
|
7
22
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
23
|
+
# Initialize the DataSet. Can be
|
24
|
+
# done with a Problem entity
|
25
|
+
# (thereby creating an empty set)
|
26
|
+
# or with a filename (representing
|
27
|
+
# a serialized data set which will
|
28
|
+
# then be deserialized and loaded).
|
29
|
+
def initialize(prob_or_file)
|
30
|
+
if prob_or_file.is_a?(String)
|
31
|
+
ds = self.class.
|
32
|
+
unserialize(prob_or_file)
|
33
|
+
@problem = ds.problem
|
34
|
+
@items = ds.items
|
35
|
+
@entities = ds.entities
|
36
|
+
else
|
37
|
+
@problem = prob_or_file
|
38
|
+
@items, @entities = [], []
|
14
39
|
end
|
15
|
-
::Psych.load(
|
16
|
-
File.read(file))
|
17
|
-
end
|
18
|
-
|
19
|
-
def initialize(classification)
|
20
|
-
@classification = classification
|
21
|
-
@labels = classification.labels
|
22
|
-
@items = []
|
23
|
-
@ids = []
|
24
40
|
end
|
25
41
|
|
42
|
+
# Add an entity to the data set.
|
43
|
+
# The entity's relevant features
|
44
|
+
# are calculated based on the
|
45
|
+
# classification problem, and a
|
46
|
+
# line with the results of the
|
47
|
+
# calculation is added to the
|
48
|
+
# data set, along with the ID
|
49
|
+
# of the entity.
|
26
50
|
def <<(entity)
|
27
|
-
@items <<
|
28
|
-
@classification.
|
51
|
+
@items << @problem.
|
29
52
|
export_item(entity)
|
30
|
-
@
|
53
|
+
@entities << entity.id
|
31
54
|
end
|
32
55
|
|
33
|
-
|
34
|
-
|
35
|
-
|
56
|
+
# Marshal the data set to the supplied
|
57
|
+
# file name. Marshal is used for speed;
|
58
|
+
# other serialization options may be
|
59
|
+
# provided in later versions. This
|
60
|
+
# method relies on the sourcify gem
|
61
|
+
# to transform Feature procs to strings,
|
62
|
+
# since procs/lambdas can't be serialized.
|
63
|
+
def serialize(file)
|
64
|
+
problem = @problem.dup
|
65
|
+
problem.features.each do |feature|
|
66
|
+
next unless feature.proc
|
67
|
+
feature.proc = feature.proc.to_source
|
68
|
+
end
|
69
|
+
data = [problem, @items, @entities]
|
70
|
+
File.open(file, 'w') do |f|
|
71
|
+
f.write(Marshal.dump(data))
|
72
|
+
end
|
73
|
+
problem.features.each do |feature|
|
74
|
+
next unless feature.proc
|
75
|
+
source = feature.proc[5..-1]
|
76
|
+
feature.proc = eval("Proc.new #{source}")
|
36
77
|
end
|
37
78
|
end
|
38
79
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
80
|
+
# Unserialize a data set file created
|
81
|
+
# by using the #serialize method.
|
82
|
+
def self.unserialize(file)
|
83
|
+
data = Marshal.load(File.read(file))
|
84
|
+
problem, items, entities = *data
|
85
|
+
problem.features.each do |feature|
|
86
|
+
next unless feature.proc
|
87
|
+
source = feature.proc[5..-1]
|
88
|
+
feature.proc = eval("Proc.new #{source}")
|
89
|
+
end
|
90
|
+
data_set = Treat::Core::DataSet.new(problem)
|
91
|
+
data_set.items = items
|
92
|
+
data_set.entities = entities
|
93
|
+
data_set
|
46
94
|
end
|
47
95
|
|
48
96
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Represents a feature to be used
|
2
|
+
# in a classification task.
|
3
|
+
class Treat::Core::Feature
|
4
|
+
|
5
|
+
# The name of the feature. If no
|
6
|
+
# proc is supplied, this assumes
|
7
|
+
# that the target of your classification
|
8
|
+
# problem responds to the method
|
9
|
+
# corresponding to this name.
|
10
|
+
attr_accessor :name
|
11
|
+
# A proc that can be used to perform
|
12
|
+
# calculations before storing a feature.
|
13
|
+
attr_accessor :proc
|
14
|
+
# The default value to be used when the feature evaluates to nil or false.
|
15
|
+
attr_accessor :default
|
16
|
+
|
17
|
+
# Initialize a feature for a classification
|
18
|
+
# problem. If two arguments are supplied,
|
19
|
+
# the second argument is assumed to be the
|
20
|
+
# default value. If three arguments are
|
21
|
+
# supplied, the second argument is the
|
22
|
+
# callback to generate the feature, and
|
23
|
+
# the third one is the default value.
|
24
|
+
def initialize(name, proc_or_default = nil, default = nil)
|
25
|
+
@name = name
|
26
|
+
if proc_or_default.is_a?(Proc)
|
27
|
+
@proc, @default =
|
28
|
+
proc_or_default, default
|
29
|
+
else
|
30
|
+
@proc = nil
|
31
|
+
@default = proc_or_default
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Defines a classification problem.
|
2
|
+
# - What question are we trying to answer?
|
3
|
+
# - What features are we going to look at
|
4
|
+
# to attempt to answer that question?
|
5
|
+
class Treat::Core::Problem
|
6
|
+
|
7
|
+
# The question we are trying to answer.
|
8
|
+
attr_accessor :question
|
9
|
+
# An array of features that will be
|
10
|
+
# looked at in trying to answer the
|
11
|
+
# problem's question.
|
12
|
+
attr_accessor :features
|
13
|
+
# Just the labels from the features.
|
14
|
+
attr_accessor :labels
|
15
|
+
|
16
|
+
# Initialize the problem with a question
|
17
|
+
# and an arbitrary number of features.
|
18
|
+
def initialize(question, *features)
|
19
|
+
@question = question
|
20
|
+
@features = features
|
21
|
+
@labels = @features.map { |f| f.name }
|
22
|
+
end
|
23
|
+
|
24
|
+
# Return an array of all the entity's
|
25
|
+
# features, as defined by the problem.
|
26
|
+
# If include_answer is set to true, will
|
27
|
+
# append the answer to the problem after
|
28
|
+
# all of the features.
|
29
|
+
def export_item(e, include_answer = true)
|
30
|
+
line = []
|
31
|
+
@features.each do |feature|
|
32
|
+
r = feature.proc ?
|
33
|
+
feature.proc.call(e) :
|
34
|
+
e.send(feature.name)
|
35
|
+
line << (r || feature.default)
|
36
|
+
end
|
37
|
+
return line unless include_answer
|
38
|
+
line << (e.has?(@question.name) ?
|
39
|
+
e.get(@question.name) : @question.default)
|
40
|
+
line
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Defines a question to answer in the
|
2
|
+
# context of a classification problem.
|
3
|
+
class Treat::Core::Question
|
4
|
+
|
5
|
+
# Defines an arbitrary label for the
|
6
|
+
# question we are trying to answer
|
7
|
+
# (e.g. is_key_sentence), which will
|
8
|
+
# also be used as the annotation name
|
9
|
+
# for the answer to the question.
|
10
|
+
attr_accessor :name
|
11
|
+
# Can be :continuous or :discrete,
|
12
|
+
# depending on the features used.
|
13
|
+
attr_accessor :type
|
14
|
+
# Defines the target of the question
|
15
|
+
# (e.g. :sentence, :paragraph, etc.)
|
16
|
+
attr_accessor :target
|
17
|
+
# Default for the answer to the question.
|
18
|
+
attr_accessor :default
|
19
|
+
|
20
|
+
# Initialize the question.
|
21
|
+
def initialize(name, target,
|
22
|
+
type = :continuous, default = nil)
|
23
|
+
@name, @target = name, target
|
24
|
+
@type, @default = type, default
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
# from a folder of files, a specific file,
|
3
3
|
# a string or a numeric object. This class
|
4
4
|
# is pretty much self-explanatory.
|
5
|
+
# FIXME how can we make this language independent?
|
5
6
|
module Treat::Entities::Abilities::Buildable
|
6
7
|
|
7
8
|
require 'schiphol'
|
@@ -162,8 +163,10 @@ module Treat::Entities::Abilities::Buildable
|
|
162
163
|
# Build a document from a raw or serialized file.
|
163
164
|
def from_file(file, options)
|
164
165
|
|
165
|
-
|
166
|
-
|
166
|
+
if file.index('yml') ||
|
167
|
+
file.index('yaml') ||
|
168
|
+
file.index('xml') ||
|
169
|
+
file.index('mongo')
|
167
170
|
from_serialized_file(file, options)
|
168
171
|
else
|
169
172
|
fmt = Treat::Workers::Formatters::Readers::Autoselect.
|
@@ -221,7 +224,6 @@ module Treat::Entities::Abilities::Buildable
|
|
221
224
|
id = options[:id]
|
222
225
|
e = self.new(nil, id)
|
223
226
|
e.unserialize(adapter, options)
|
224
|
-
e
|
225
227
|
end
|
226
228
|
|
227
229
|
# Build any kind of entity from a string.
|
@@ -1,11 +1,11 @@
|
|
1
1
|
module Treat::Entities::Abilities::Exportable
|
2
2
|
|
3
|
-
def export(
|
4
|
-
ds = Treat::Core::DataSet.new(
|
5
|
-
each_entity(
|
3
|
+
def export(problem)
|
4
|
+
ds = Treat::Core::DataSet.new(problem)
|
5
|
+
each_entity(problem.question.target) do |e|
|
6
6
|
ds << e
|
7
7
|
end
|
8
8
|
ds
|
9
9
|
end
|
10
10
|
|
11
|
-
end
|
11
|
+
end
|
data/lib/treat/entities/group.rb
CHANGED
@@ -1,15 +1,18 @@
|
|
1
1
|
module Treat::Entities
|
2
2
|
|
3
|
-
#
|
4
|
-
class Group <
|
3
|
+
# Represents a group of tokens.
|
4
|
+
class Group < Entity; end
|
5
5
|
|
6
|
-
# Represents a group of words
|
6
|
+
# Represents a group of words
|
7
|
+
# with a sentence ender (.!?)
|
7
8
|
class Sentence < Group; end
|
8
9
|
|
9
|
-
# Represents a group of words
|
10
|
+
# Represents a group of words,
|
11
|
+
# with no sentence ender.
|
10
12
|
class Phrase < Group; end
|
11
13
|
|
12
|
-
# Represents a non-linguistic
|
14
|
+
# Represents a non-linguistic
|
15
|
+
# fragment (e.g. stray symbols).
|
13
16
|
class Fragment < Group; end
|
14
17
|
|
15
18
|
end
|
data/lib/treat/entities/token.rb
CHANGED
@@ -1,31 +1,43 @@
|
|
1
1
|
module Treat::Entities
|
2
|
-
|
3
|
-
|
2
|
+
|
3
|
+
# Represents a terminal element
|
4
|
+
# (leaf) in the text structure.
|
5
|
+
class Token < Entity; end
|
4
6
|
|
5
|
-
# Represents a word.
|
7
|
+
# Represents a word. Strictly,
|
8
|
+
# this is /^[[:alpha:]\-']+$/.
|
6
9
|
class Word < Token; end
|
7
10
|
|
8
|
-
# Represents
|
11
|
+
# Represents an enclitic.
|
12
|
+
# Strictly, this is any of
|
13
|
+
# 'll 'm 're 's 't or 've.
|
9
14
|
class Enclitic < Token; end
|
10
15
|
|
11
|
-
# Represents a number.
|
16
|
+
# Represents a number. Strictly,
|
17
|
+
# this is /^#?([0-9]+)(\.[0-9]+)?$/.
|
12
18
|
class Number < Token
|
13
19
|
def to_i; to_s.to_i; end
|
14
20
|
def to_f; to_s.to_f; end
|
15
21
|
end
|
16
22
|
|
17
23
|
# Represents a punctuation sign.
|
24
|
+
# Strictly, this is /^[[:punct:]\$]+$/.
|
18
25
|
class Punctuation < Token; end
|
19
26
|
|
20
27
|
# Represents a character that is neither
|
21
|
-
#
|
22
|
-
# character (e.g. @#$%&*).
|
28
|
+
# a word, an enclitic, a number or a
|
29
|
+
# punctuation character (e.g. @#$%&*).
|
23
30
|
class Symbol < Token; end
|
24
31
|
|
25
|
-
# Represents a url.
|
32
|
+
# Represents a url. This is (imperfectly)
|
33
|
+
# defined as /^(http|https):\/\/[a-z0-9]
|
34
|
+
# +([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}
|
35
|
+
# (([0-9]{1,5})?\/.*)?$/ix
|
26
36
|
class Url < Token; end
|
27
37
|
|
28
38
|
# Represents a valid RFC822 address.
|
39
|
+
# This is (imperfectly) defined as
|
40
|
+
# /.+\@.+\..+/ (fixme maybe?)
|
29
41
|
class Email < Token; end
|
30
42
|
|
31
43
|
# Represents a token whose type
|
data/lib/treat/entities/zone.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
module Treat::Entities
|
2
|
-
# Represents a zone of text
|
3
|
-
|
4
|
-
class Zone < Treat::Entities::Entity; end
|
2
|
+
# Represents a zone of text.
|
3
|
+
class Zone < Entity; end
|
5
4
|
|
6
|
-
# Represents a title, subtitle,
|
5
|
+
# Represents a title, subtitle,
|
6
|
+
# logical header of a text.
|
7
7
|
class Title < Zone; end
|
8
8
|
|
9
|
-
# Represents a paragraph
|
9
|
+
# Represents a paragraph (group
|
10
|
+
# of sentences and/or phrases).
|
10
11
|
class Paragraph < Zone; end
|
11
12
|
end
|
@@ -2,28 +2,27 @@
|
|
2
2
|
# registered with the Linguistics gem.
|
3
3
|
class Treat::Loaders::Linguistics
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
5
|
+
# Linguistics throws warnings; silence them.
|
6
|
+
silence_warnings { require 'linguistics' }
|
7
|
+
|
8
|
+
# Linguistics classes for each language.
|
9
9
|
@@languages = {}
|
10
10
|
|
11
|
+
# Load the Linguistics class that corresponds
|
12
|
+
# to the supplied language; raises an exception
|
13
|
+
# if there is no such language class registered.
|
11
14
|
def self.load(language)
|
12
|
-
|
13
|
-
|
15
|
+
silence_warnings do
|
16
|
+
@@languages[language] ||=
|
17
|
+
::Linguistics.const_get(
|
18
|
+
language.to_s[0..1].upcase)
|
14
19
|
end
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
rescue RuntimeError
|
22
|
-
raise "Ruby Linguistics does " +
|
23
|
-
"not have a module installed " +
|
24
|
-
"for the #{language} language."
|
25
|
-
end
|
26
|
-
|
20
|
+
return @@languages[language]
|
21
|
+
rescue RuntimeError
|
22
|
+
raise Treat::Exception,
|
23
|
+
"Ruby Linguistics does " +
|
24
|
+
"not have a module installed " +
|
25
|
+
"for the #{language} language."
|
27
26
|
end
|
28
27
|
|
29
|
-
end
|
28
|
+
end
|
@@ -1,10 +1,11 @@
|
|
1
|
-
# A helper class to load the
|
2
|
-
# Stanford Core NLP package.
|
1
|
+
# A helper class to load the CoreNLP package.
|
3
2
|
class Treat::Loaders::Stanford
|
4
3
|
|
5
4
|
require 'stanford-core-nlp'
|
5
|
+
|
6
6
|
@@loaded = false
|
7
7
|
|
8
|
+
# Load CoreNLP package for a given language.
|
8
9
|
def self.load(language = nil)
|
9
10
|
return if @@loaded
|
10
11
|
language ||= Treat.core.language.default
|
data/lib/treat/version.rb
CHANGED
@@ -1,61 +1,57 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
if options[:bias_toward].include?(l)
|
51
|
-
return l
|
52
|
-
end
|
1
|
+
# Adaptor for the 'whatlanguage' gem, which
|
2
|
+
# performs probabilistic language detection.
|
3
|
+
# The library works by checking for the presence
|
4
|
+
# of words with bloom filters built from
|
5
|
+
# dictionaries based upon each source language.
|
6
|
+
module Treat::Workers::Extractors::Language::WhatLanguage
|
7
|
+
|
8
|
+
# Require the 'whatlanguage' gem.
|
9
|
+
silence_warnings { require 'whatlanguage' }
|
10
|
+
|
11
|
+
# Undefine the method defined by the gem.
|
12
|
+
String.class_eval { undef :language }
|
13
|
+
|
14
|
+
# By default, bias towards common languages.
|
15
|
+
DefaultOptions = {
|
16
|
+
:bias_toward => [:english, :french, :chinese, :german, :arabic, :spanish]
|
17
|
+
}
|
18
|
+
|
19
|
+
# Keep only one instance of the gem class.
|
20
|
+
@@detector = nil
|
21
|
+
|
22
|
+
# Detect the language of an entity using the
|
23
|
+
# 'whatlanguage' gem. Return a symbol
|
24
|
+
# identifying the detected language
|
25
|
+
# (e.g. :english).
|
26
|
+
#
|
27
|
+
# Options:
|
28
|
+
#
|
29
|
+
# - (Array of Symbols) :bias_toward => Languages to bias
|
30
|
+
# toward when more than one language is detected
|
31
|
+
# with equal probability.
|
32
|
+
def self.language(entity, options = {})
|
33
|
+
|
34
|
+
options = DefaultOptions.merge(options)
|
35
|
+
|
36
|
+
@@detector ||= ::WhatLanguage.new(:possibilities)
|
37
|
+
possibilities = @@detector.process_text(entity.to_s)
|
38
|
+
lang = {}
|
39
|
+
|
40
|
+
possibilities.each do |k,v|
|
41
|
+
lang[k.intern] = v
|
42
|
+
end
|
43
|
+
|
44
|
+
max = lang.values.max
|
45
|
+
ordered = lang.select { |i,j| j == max }.keys
|
46
|
+
|
47
|
+
ordered.each do |l|
|
48
|
+
if options[:bias_toward].include?(l)
|
49
|
+
return l
|
53
50
|
end
|
54
|
-
|
55
|
-
return ordered.first
|
56
|
-
|
57
51
|
end
|
58
|
-
|
52
|
+
|
53
|
+
return ordered.first
|
54
|
+
|
59
55
|
end
|
60
56
|
|
61
|
-
end
|
57
|
+
end
|
@@ -12,24 +12,27 @@ class Treat::Workers::Extractors::NameTag::Stanford
|
|
12
12
|
|
13
13
|
pp = nil
|
14
14
|
|
15
|
-
|
15
|
+
language = entity.language
|
16
16
|
|
17
|
-
Treat::Loaders::Stanford.load(
|
17
|
+
Treat::Loaders::Stanford.load(language)
|
18
18
|
|
19
19
|
isolated_token = entity.is_a?(Treat::Entities::Token)
|
20
20
|
tokens = isolated_token ? [entity] : entity.tokens
|
21
21
|
|
22
22
|
ms = StanfordCoreNLP::Config::Models[:ner][language]
|
23
|
-
|
23
|
+
model_path = Treat.libraries.stanford.model_path ||
|
24
|
+
(Treat.paths.models + '/stanford/')
|
25
|
+
ms = model_path + '/' +
|
24
26
|
StanfordCoreNLP::Config::ModelFolders[:ner] +
|
25
27
|
ms['3class']
|
26
28
|
|
27
|
-
@@classifiers[
|
29
|
+
@@classifiers[language] ||=
|
28
30
|
StanfordCoreNLP::CRFClassifier.
|
29
31
|
getClassifier(ms)
|
30
32
|
|
31
33
|
token_list = StanfordCoreNLP.get_list(tokens)
|
32
|
-
sentence = @@classifiers[
|
34
|
+
sentence = @@classifiers[language].
|
35
|
+
classify_sentence(token_list)
|
33
36
|
|
34
37
|
i = 0
|
35
38
|
n = 0
|