treat 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +4 -4
- data/TODO +21 -54
- data/lib/economist/half_cocked_basel.txt +16 -0
- data/lib/economist/hose_and_dry.doc +0 -0
- data/lib/economist/hungarys_troubles.abw +70 -0
- data/lib/economist/republican_nomination.pdf +0 -0
- data/lib/economist/saving_the_euro.odt +0 -0
- data/lib/economist/to_infinity_and_beyond.txt +15 -0
- data/lib/economist/zero_sum.html +91 -0
- data/lib/treat.rb +58 -72
- data/lib/treat/buildable.rb +59 -15
- data/lib/treat/categories.rb +26 -14
- data/lib/treat/category.rb +2 -2
- data/lib/treat/delegatable.rb +65 -48
- data/lib/treat/doable.rb +44 -0
- data/lib/treat/entities.rb +34 -14
- data/lib/treat/entities/collection.rb +2 -0
- data/lib/treat/entities/document.rb +3 -2
- data/lib/treat/entities/entity.rb +105 -90
- data/lib/treat/entities/phrases.rb +17 -0
- data/lib/treat/entities/tokens.rb +28 -13
- data/lib/treat/entities/zones.rb +20 -0
- data/lib/treat/extractors.rb +49 -11
- data/lib/treat/extractors/coreferences/stanford.rb +68 -0
- data/lib/treat/extractors/date/chronic.rb +32 -0
- data/lib/treat/extractors/date/ruby.rb +25 -0
- data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
- data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
- data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
- data/lib/treat/extractors/language/what_language.rb +49 -0
- data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
- data/lib/treat/extractors/roles/naive.rb +73 -0
- data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
- data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
- data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
- data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
- data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
- data/lib/treat/extractors/time/nickel.rb +30 -12
- data/lib/treat/extractors/topic_words/lda.rb +9 -9
- data/lib/treat/extractors/topics/reuters.rb +14 -15
- data/lib/treat/extractors/topics/reuters/region.xml +1 -0
- data/lib/treat/features.rb +7 -0
- data/lib/treat/formatters/readers/abw.rb +6 -1
- data/lib/treat/formatters/readers/autoselect.rb +5 -6
- data/lib/treat/formatters/readers/doc.rb +3 -1
- data/lib/treat/formatters/readers/html.rb +1 -1
- data/lib/treat/formatters/readers/image.rb +43 -0
- data/lib/treat/formatters/readers/odt.rb +1 -2
- data/lib/treat/formatters/readers/pdf.rb +9 -1
- data/lib/treat/formatters/readers/xml.rb +40 -0
- data/lib/treat/formatters/serializers/xml.rb +50 -14
- data/lib/treat/formatters/serializers/yaml.rb +7 -2
- data/lib/treat/formatters/unserializers/xml.rb +33 -7
- data/lib/treat/formatters/visualizers/dot.rb +90 -20
- data/lib/treat/formatters/visualizers/short_value.rb +2 -2
- data/lib/treat/formatters/visualizers/standoff.rb +2 -2
- data/lib/treat/formatters/visualizers/tree.rb +1 -1
- data/lib/treat/formatters/visualizers/txt.rb +13 -4
- data/lib/treat/group.rb +16 -10
- data/lib/treat/helpers/linguistics_loader.rb +18 -0
- data/lib/treat/inflectors.rb +10 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
- data/lib/treat/inflectors/declensions/english.rb +319 -0
- data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
- data/lib/treat/install.rb +59 -0
- data/lib/treat/kernel.rb +18 -8
- data/lib/treat/languages.rb +18 -11
- data/lib/treat/languages/arabic.rb +4 -2
- data/lib/treat/languages/chinese.rb +6 -2
- data/lib/treat/languages/dutch.rb +16 -0
- data/lib/treat/languages/english.rb +47 -19
- data/lib/treat/languages/french.rb +8 -5
- data/lib/treat/languages/german.rb +9 -6
- data/lib/treat/languages/greek.rb +16 -0
- data/lib/treat/languages/italian.rb +6 -3
- data/lib/treat/languages/polish.rb +16 -0
- data/lib/treat/languages/portuguese.rb +16 -0
- data/lib/treat/languages/russian.rb +16 -0
- data/lib/treat/languages/spanish.rb +16 -0
- data/lib/treat/languages/swedish.rb +16 -0
- data/lib/treat/languages/tags.rb +377 -0
- data/lib/treat/lexicalizers.rb +34 -23
- data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
- data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
- data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
- data/lib/treat/lexicalizers/tag/brill.rb +35 -40
- data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
- data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
- data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
- data/lib/treat/processors.rb +8 -8
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +114 -99
- data/lib/treat/processors/parsers/stanford.rb +109 -41
- data/lib/treat/processors/segmenters/punkt.rb +17 -18
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
- data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
- data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
- data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
- data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
- data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
- data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
- data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
- data/lib/treat/processors/segmenters/stanford.rb +38 -37
- data/lib/treat/processors/segmenters/tactful.rb +5 -4
- data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
- data/lib/treat/processors/tokenizers/perl.rb +2 -2
- data/lib/treat/processors/tokenizers/punkt.rb +6 -2
- data/lib/treat/processors/tokenizers/stanford.rb +25 -24
- data/lib/treat/processors/tokenizers/tactful.rb +1 -2
- data/lib/treat/proxies.rb +2 -35
- data/lib/treat/registrable.rb +17 -22
- data/lib/treat/sugar.rb +11 -11
- data/lib/treat/tree.rb +27 -17
- data/lib/treat/viewable.rb +29 -0
- data/lib/treat/visitable.rb +1 -1
- data/test/tc_entity.rb +56 -49
- data/test/tc_extractors.rb +41 -18
- data/test/tc_formatters.rb +7 -8
- data/test/tc_inflectors.rb +19 -24
- data/test/tc_lexicalizers.rb +12 -19
- data/test/tc_processors.rb +26 -12
- data/test/tc_resources.rb +2 -7
- data/test/tc_treat.rb +20 -22
- data/test/tc_tree.rb +4 -4
- data/test/tests.rb +3 -5
- data/test/texts.rb +13 -14
- data/tmp/INFO +1 -0
- metadata +78 -158
- data/bin/INFO +0 -1
- data/examples/benchmark.rb +0 -81
- data/examples/keywords.rb +0 -148
- data/lib/treat/detectors.rb +0 -31
- data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
- data/lib/treat/detectors/format/file.rb +0 -36
- data/lib/treat/detectors/language/what_language.rb +0 -29
- data/lib/treat/entities/constituents.rb +0 -15
- data/lib/treat/entities/sentence.rb +0 -8
- data/lib/treat/extractors/named_entity/abner.rb +0 -20
- data/lib/treat/extractors/named_entity/stanford.rb +0 -174
- data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
- data/lib/treat/extractors/time/chronic.rb +0 -20
- data/lib/treat/extractors/time/native.rb +0 -18
- data/lib/treat/formatters/readers/gocr.rb +0 -26
- data/lib/treat/formatters/readers/ocropus.rb +0 -31
- data/lib/treat/formatters/visualizers/html.rb +0 -13
- data/lib/treat/formatters/visualizers/inspect.rb +0 -20
- data/lib/treat/inflectors/declensions/en.rb +0 -18
- data/lib/treat/languages/categories.rb +0 -5
- data/lib/treat/languages/english/categories.rb +0 -23
- data/lib/treat/languages/english/tags.rb +0 -352
- data/lib/treat/languages/xinhua.rb +0 -12
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
- data/lib/treat/string.rb +0 -5
- data/test/tc_detectors.rb +0 -26
data/bin/INFO
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
This is where Treat will look for the Stanford JAR files by default. You can change this to another directory by setting Treat.bin = '/path/to/your/folder/' at runtime.
|
data/examples/benchmark.rb
DELETED
@@ -1,81 +0,0 @@
|
|
1
|
-
require 'benchmark'
|
2
|
-
require 'treat'
|
3
|
-
|
4
|
-
Benchmark.bmbm do |x|
|
5
|
-
|
6
|
-
Treat.edulcorate
|
7
|
-
|
8
|
-
=begin
|
9
|
-
# Readers
|
10
|
-
x.report("Read:PDF") { doc = Document 'pages/hhmm_article.pdf'; doc.read }
|
11
|
-
x.report("Read:TXT") { doc = Document 'pages/kant_short.txt'; doc.read }
|
12
|
-
x.report("Read:YAML") { doc = Document 'pages/nanotechnology_article.yml'; doc.read }
|
13
|
-
# x.report("Read:XML") { doc = Document 'pages/test.xml'; doc.read }
|
14
|
-
x.report("Read:Image") { doc = Document 'pages/novel_page.jpg'; doc.read }
|
15
|
-
|
16
|
-
# Read collection of texts.
|
17
|
-
coll = Collection 'pages'
|
18
|
-
coll.read
|
19
|
-
=end
|
20
|
-
|
21
|
-
# Processors.
|
22
|
-
# x.report("Cluster:LDA") { coll.cluster(:lda) }
|
23
|
-
x.report("Chunk:txt ") { text.chunk(:txt) }
|
24
|
-
x.report("Segment:punkt ") { text.segment(:punkt) }
|
25
|
-
x.report("Segment:tactful ") { text.segment(:tactful) }
|
26
|
-
x.report("Segment:stanford ") { text.segment(:stanford) }
|
27
|
-
x.report("Tokenize:macintyre ") { text.tokenize(:macintyre) }
|
28
|
-
x.report("Tokenize:multilingual "){ text.tokenize(:multilingual) }
|
29
|
-
x.report("Tokenize:perl "){ text.tokenize(:perl) }
|
30
|
-
x.report("Tokenize:stanford ") { text.tokenize(:stanford) }
|
31
|
-
x.report("Parse:enju") { text = text.parse(:enju) }
|
32
|
-
# x.report("Parse:stanford") { text = text.parse(:stanford) }
|
33
|
-
# x.report("Parse:link") { text = text.parse(:link) }
|
34
|
-
|
35
|
-
doc = Document 'pages/kant_short.txt'
|
36
|
-
text = doc.read.text.chunk.segment.tokenize
|
37
|
-
|
38
|
-
# Formatters.
|
39
|
-
yaml = nil; xml = nil
|
40
|
-
x.report("Serialize:yaml") { yaml = text.serialize(:yaml) }
|
41
|
-
x.report("Serialize:xml") { xml = text.serialize(:xml) }
|
42
|
-
x.report("Visualize:tree") { text.visualize(:tree) }
|
43
|
-
x.report("Visualize:txt") { text.visualize(:txt) }
|
44
|
-
# x.report("Visualize:dot") { text.visualize(:dot) }
|
45
|
-
# x.report("Visualize:standoff") { text.visualize(:standoff) }
|
46
|
-
# x.report("Visualize:simple_html") { text.visualize(:html) }
|
47
|
-
# Clean: html
|
48
|
-
|
49
|
-
# Detectors
|
50
|
-
x.report("Langugage:what_language ") { text.language(:what_language) }
|
51
|
-
x.report("Encoding:r_chardet19 ") { text.encoding(:r_chardet19) }
|
52
|
-
x.report("Format:file ") { text.format(:file) }
|
53
|
-
|
54
|
-
# Extractors
|
55
|
-
x.report("Date:chronic") { '2007/02/12'.date(:chronic) }
|
56
|
-
x.report("Date:native") { '2007/02/12'.date(:native) }
|
57
|
-
x.report("Time:chronic") { '2007/02/12'.time(:chronic) }
|
58
|
-
x.report("Topic:reuters") { text.topic }
|
59
|
-
x.report("Statistics:frequency:") { text.each_token { |token| token.statistics(:frequency) } }
|
60
|
-
# x.report("Statistics:position:") { text.each_token { |token| token.statistics(:position) } }
|
61
|
-
|
62
|
-
# Inflectors
|
63
|
-
# x.report("Lemma:elemma") { text.each_word { |word| word.lemma(:elemma) } }
|
64
|
-
x.report("Stem:porter_r") { text.each_word { |word| word.stem(:porter) } }
|
65
|
-
x.report("Stem:porter_c") { text.each_word { |word| word.stem(:porter_c) } }
|
66
|
-
x.report("Stem:uea") { text.each_word { |word| word.stem(:uea) } }
|
67
|
-
x.report("Declense:granger") { text.each_word { |word| word.declense(:granger) } }
|
68
|
-
# x.report("Inflect:granger") { text.each_noun { |word| word.plural(:granger) } }
|
69
|
-
|
70
|
-
# Statistics
|
71
|
-
x.report("Entity:word_count") { text.word_count }
|
72
|
-
|
73
|
-
# puts text.words_with_cat(:noun).inspect
|
74
|
-
|
75
|
-
# Lexicalizers
|
76
|
-
x.report("Tag:stanford") { text.each_word { |word| word.tag(:stanford) } }
|
77
|
-
# x.report("Tag:brill") { text.each_word { |word| word.tag(:brill) } }
|
78
|
-
# x.report("Tag:lingua") { text.each_word { |word| word.tag(:lingua) } }
|
79
|
-
# x.report("Lex:wordnet") { text.each_word { |word| word.lex(:wordnet) } }
|
80
|
-
|
81
|
-
end
|
data/examples/keywords.rb
DELETED
@@ -1,148 +0,0 @@
|
|
1
|
-
require 'benchmark'
|
2
|
-
require 'treat'
|
3
|
-
|
4
|
-
Treat.edulcorate
|
5
|
-
|
6
|
-
c = Collection.from_serialized('texts/corpus.yml')
|
7
|
-
|
8
|
-
=begin
|
9
|
-
c.each_text do |t|
|
10
|
-
t.chunk.segment.parse(:stanford)
|
11
|
-
puts "Done text #{t.id}."
|
12
|
-
end
|
13
|
-
|
14
|
-
c.serialize(:yaml).save("economist/corpus.yml")
|
15
|
-
|
16
|
-
=end
|
17
|
-
|
18
|
-
topic_words = c.topic_words(
|
19
|
-
:lda,
|
20
|
-
:topics => 5,
|
21
|
-
:words_per_topic => 5,
|
22
|
-
:iterations => 20
|
23
|
-
)
|
24
|
-
|
25
|
-
c.each_document do |d|
|
26
|
-
|
27
|
-
sentences = d.key_sentences(
|
28
|
-
:topics_frequency,
|
29
|
-
:topic_words => topic_words,
|
30
|
-
:threshold => 4
|
31
|
-
)
|
32
|
-
|
33
|
-
tm = d.statistics(
|
34
|
-
:transition_matrix,
|
35
|
-
:features => [:tag],
|
36
|
-
:entity_type => :word,
|
37
|
-
:condition => lambda do |word|
|
38
|
-
word.has?(:is_keyword?) &&
|
39
|
-
word.is_keyword?
|
40
|
-
end
|
41
|
-
)
|
42
|
-
|
43
|
-
sentences.each do |sentence|
|
44
|
-
sentence.each_word do |word|
|
45
|
-
score = word.statistics(
|
46
|
-
:transition_probability,
|
47
|
-
:transition_matrix => tm,
|
48
|
-
:relationships => [:parent, :left, :right, :children]
|
49
|
-
)
|
50
|
-
if word.has?(:is_keyword?) &&
|
51
|
-
word.is_keyword?
|
52
|
-
score += 0.5
|
53
|
-
end
|
54
|
-
if score > 1
|
55
|
-
puts word.to_s
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
end
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
Treat.edulcorate
|
65
|
-
Treat.bin = '/ruby/nat/bin'
|
66
|
-
|
67
|
-
c = Collection 'economist'
|
68
|
-
c.each_document { |doc| doc.chunk.segment.tokenize }
|
69
|
-
|
70
|
-
topic_words = c.topic_words(
|
71
|
-
:lda,
|
72
|
-
:topics => 5,
|
73
|
-
:words_per_topic => 5,
|
74
|
-
:iterations => 20
|
75
|
-
)
|
76
|
-
|
77
|
-
keywords = c.keywords(
|
78
|
-
:topics_frequency,
|
79
|
-
:topic_words => topic_words,
|
80
|
-
:tf_idf_threshold => 180
|
81
|
-
)
|
82
|
-
|
83
|
-
puts keywords.inspect
|
84
|
-
|
85
|
-
abort
|
86
|
-
|
87
|
-
c = Phrase 'a test clause'
|
88
|
-
c.parse
|
89
|
-
puts c.visualize(:tree)
|
90
|
-
puts c.visualize(:inspect)
|
91
|
-
puts c.visualize(:short_value)
|
92
|
-
puts c.visualize(:standoff)
|
93
|
-
puts c.visualize(:tree)
|
94
|
-
|
95
|
-
c.serialize(:yaml).save('test.yml')
|
96
|
-
c.serialize(:xml).save('test.xml')
|
97
|
-
|
98
|
-
d = Phrase 'test.yml'
|
99
|
-
d.print_tree
|
100
|
-
d = Phrase 'test.xml'
|
101
|
-
d.print_tree
|
102
|
-
|
103
|
-
puts d.words[0].position_in_parent
|
104
|
-
abort
|
105
|
-
|
106
|
-
w = Word 'running'
|
107
|
-
puts w.stem(:porter_c)
|
108
|
-
puts w.stem(:porter)
|
109
|
-
puts w.stem(:uea)
|
110
|
-
|
111
|
-
w = Word 'run'
|
112
|
-
|
113
|
-
puts w.infinitive(:linguistics)
|
114
|
-
puts w.present_participle(:linguistics)
|
115
|
-
puts w.plural(:linguistics)
|
116
|
-
|
117
|
-
w = Word 'table'
|
118
|
-
|
119
|
-
puts w.synonyms.inspect
|
120
|
-
puts w.antonyms.inspect
|
121
|
-
puts w.hyponyms.inspect
|
122
|
-
puts w.hypernyms.inspect
|
123
|
-
|
124
|
-
n = Number 2
|
125
|
-
puts n.ordinal_words(:linguistics)
|
126
|
-
puts n.cardinal_words(:linguistics)
|
127
|
-
|
128
|
-
s = Sentence 'A sentence to parse.'
|
129
|
-
s.dup.parse(:enju).print_tree
|
130
|
-
s.dup.parse(:stanford).print_tree
|
131
|
-
|
132
|
-
s = Sentence 'A sentence to tokenize'
|
133
|
-
s.dup.tokenize(:macintyre).print_tree
|
134
|
-
s.dup.tokenize(:multilingual).print_tree
|
135
|
-
s.dup.tokenize(:perl).print_tree
|
136
|
-
s.dup.tokenize(:punkt).print_tree
|
137
|
-
s.dup.tokenize(:stanford).print_tree
|
138
|
-
s.dup.tokenize(:tactful).print_tree
|
139
|
-
|
140
|
-
|
141
|
-
=begin
|
142
|
-
c = Collection 'economist'
|
143
|
-
# c.each_document { |d| d.chunk.segment.tokenize }
|
144
|
-
c.documents[0].chunk.segment
|
145
|
-
c.sentences[0].parse(:enju)
|
146
|
-
c.each_word { |word| word.stem }
|
147
|
-
c.visualize(:dot, features: [:tag]).save('test.dot')
|
148
|
-
=end
|
data/lib/treat/detectors.rb
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
# Detectors detect a specific meta-information about
|
3
|
-
# an entity, such as encoding, format and language.
|
4
|
-
#
|
5
|
-
# Detectors are language-independent, and thus there
|
6
|
-
# are default algorithms specified for each of them.
|
7
|
-
module Detectors
|
8
|
-
# Group for algorithms that detect encoding.
|
9
|
-
module Encoding
|
10
|
-
extend Group
|
11
|
-
self.type = :annotator
|
12
|
-
self.targets = [:document]
|
13
|
-
self.default = :r_chardet19
|
14
|
-
end
|
15
|
-
# Group for algorithms that support format detection.
|
16
|
-
module Format
|
17
|
-
extend Group
|
18
|
-
self.type = :annotator
|
19
|
-
self.targets = [:document]
|
20
|
-
self.default = :file
|
21
|
-
end
|
22
|
-
# Group for algorithms that do language detection.
|
23
|
-
module Language
|
24
|
-
extend Group
|
25
|
-
self.type = :annotator
|
26
|
-
self.targets = [:entity]
|
27
|
-
self.default = :what_language
|
28
|
-
end
|
29
|
-
extend Treat::Category
|
30
|
-
end
|
31
|
-
end
|
@@ -1,27 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Detectors
|
3
|
-
module Encoding
|
4
|
-
# Require the 'rchardet19' gem.
|
5
|
-
silence_warnings { require 'rchardet19' }
|
6
|
-
# A wrapper for the 'rchardet19' gem, which
|
7
|
-
# detects the encoding of a file.
|
8
|
-
class RChardet19
|
9
|
-
# Returns the encoding of the document according
|
10
|
-
# to the 'rchardet19' gem.
|
11
|
-
#
|
12
|
-
# Options: none.
|
13
|
-
def self.encoding(document, options={})
|
14
|
-
r = CharDet.detect(document.file)
|
15
|
-
if r.encoding
|
16
|
-
Treat::Feature.new({
|
17
|
-
r.encoding.
|
18
|
-
gsub('-', '_').downcase.intern =>
|
19
|
-
r.confidence}).best
|
20
|
-
else
|
21
|
-
:unknown
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
@@ -1,36 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Detectors
|
3
|
-
module Format
|
4
|
-
# A wrapper for the *NIX 'file' command,
|
5
|
-
# witch uses etc/magic to detect the format
|
6
|
-
# of a file.
|
7
|
-
class File
|
8
|
-
# Returns an identifier representing
|
9
|
-
# the format of a file using the *NIX
|
10
|
-
# 'file' command.
|
11
|
-
#
|
12
|
-
# Options: none.
|
13
|
-
def self.format(entity, options = {})
|
14
|
-
format = nil
|
15
|
-
create_temp_file(:txt, entity.to_s) do |tmp|
|
16
|
-
format = `file #{tmp}`
|
17
|
-
end
|
18
|
-
if format.scan('text')
|
19
|
-
:txt
|
20
|
-
elsif format.scan('XML')
|
21
|
-
:xml
|
22
|
-
elsif format.scan('HTML')
|
23
|
-
:html
|
24
|
-
elsif format.scan('image')
|
25
|
-
:image
|
26
|
-
elsif format.scan('PDF')
|
27
|
-
:pdf
|
28
|
-
else
|
29
|
-
raise Treat::Exception,
|
30
|
-
"Unsupported text format #{format}."
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
@@ -1,29 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Detectors
|
3
|
-
module Language
|
4
|
-
# Require the 'whatlanguage' gem.
|
5
|
-
silence_warnings { require 'whatlanguage' }
|
6
|
-
# Adaptor for the 'whatlanguage' gem, which
|
7
|
-
# performs probabilistic language detection.
|
8
|
-
class WhatLanguage < LanguageDetector
|
9
|
-
# Keep only once instance of the gem class.
|
10
|
-
@@detector = nil
|
11
|
-
# Detect the language of an entity using the
|
12
|
-
# 'whatlanguage' gem. Return an identifier
|
13
|
-
# corresponding to the ISO-639-2 code for the
|
14
|
-
# language.
|
15
|
-
def self.language(entity, options = {})
|
16
|
-
predetection = super(entity, options)
|
17
|
-
return predetection if predetection
|
18
|
-
@@detector ||= ::WhatLanguage.new(:possibilities)
|
19
|
-
possibilities = @@detector.process_text(entity.to_s)
|
20
|
-
lang = {}
|
21
|
-
possibilities.each do |k,v|
|
22
|
-
lang[Treat::Languages.find(k)] = v
|
23
|
-
end
|
24
|
-
Treat::Feature.new(lang).best
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
@@ -1,15 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Entities
|
3
|
-
# Represents any syntactic constituent
|
4
|
-
# of a sentence.
|
5
|
-
class Constituent < Entity
|
6
|
-
end
|
7
|
-
# Represents a phrase inside a sentence
|
8
|
-
# or by itself.
|
9
|
-
class Phrase < Constituent
|
10
|
-
end
|
11
|
-
# Represents a clause inside a sentence.
|
12
|
-
class Clause < Constituent
|
13
|
-
end
|
14
|
-
end
|
15
|
-
end
|
@@ -1,20 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Extractors
|
3
|
-
module NamedEntity
|
4
|
-
class Abner
|
5
|
-
# Require the Ruby-Java bridge.
|
6
|
-
silence_warnings do
|
7
|
-
require 'rjb'
|
8
|
-
Rjb::load('', ['-Xms256M', '-Xmx512M'])
|
9
|
-
puts Rjb.import('tagger')
|
10
|
-
end
|
11
|
-
@@tagger = nil
|
12
|
-
def self.named_entity(entity)
|
13
|
-
@@tagger ||= AbnerTagger.new
|
14
|
-
@@tagger.tokenize(entity)
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
@@ -1,174 +0,0 @@
|
|
1
|
-
module Treat
|
2
|
-
module Extractors
|
3
|
-
module NamedEntity
|
4
|
-
class Stanford
|
5
|
-
# Require the Ruby-Java bridge.
|
6
|
-
silence_warnings do
|
7
|
-
require 'rjb'
|
8
|
-
Rjb::load(nil, ['-Xms256M', '-Xmx1024M'])
|
9
|
-
Rjb::add_jar('/ruby/treat/bin/treat/treat.jar')
|
10
|
-
Rjb::add_jar('/ruby/treat/bin/stanford/xom.jar')
|
11
|
-
Rjb::add_jar('/ruby/treat/bin/stanford/joda-time.jar')
|
12
|
-
Rjb::add_jar('/ruby/treat/bin/stanford/stanford-corenlp.jar')
|
13
|
-
StanfordCoreNLP = Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP')
|
14
|
-
Annotation = Rjb::import('edu.stanford.nlp.pipeline.Annotation')
|
15
|
-
NamedEntityTagAnnotation = Rjb::import('edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation')
|
16
|
-
Properties = Rjb::import('java.util.Properties')
|
17
|
-
end
|
18
|
-
@@classifier = nil
|
19
|
-
def self.named_entity(entity, options = {})
|
20
|
-
properties = Properties.new
|
21
|
-
properties.set_property('annotators', 'tokenize, ssplit, pos, lemma, ner')
|
22
|
-
properties.set_property('pos.model', '/ruby/treat/bin/stanford/taggers/english-left3words-distsim.tagger')
|
23
|
-
properties.set_property('ner.model.3class', '/ruby/treat/bin/stanford/classifiers/all.3class.distsim.crf.ser.gz')
|
24
|
-
properties.set_property('ner.model.7class', '/ruby/treat/bin/stanford/classifiers/muc.7class.distsim.crf.ser.gz')
|
25
|
-
properties.set_property('ner.model.MISCclass', '/ruby/treat/bin/stanford/classifiers/conll.4class.distsim.crf.ser.gz')
|
26
|
-
properties.set_property('parser.model', '/ruby/treat/bin/stanford-parser/grammar/englishPCFG.ser.gz')
|
27
|
-
silence_stream(STDOUT) do
|
28
|
-
pipeline = StanfordCoreNLP.new(properties)
|
29
|
-
end
|
30
|
-
stanford_entity = Annotation.new(entity.to_s)
|
31
|
-
pipeline.annotate(stanford_entity)
|
32
|
-
puts stanford_entity.java_methods
|
33
|
-
puts stanford_entity.get_string(NamedEntityTagAnnotation)
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
|
41
|
-
=begin
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
CRFBiasedClassifier = Rjb::import('edu.stanford.nlp.ie.crf.CRFBiasedClassifier')
|
46
|
-
Properties = Rjb::import('java.util.Properties')
|
47
|
-
List = ::Rjb::import('java.util.ArrayList')
|
48
|
-
Word = ::Rjb::import('edu.stanford.nlp.ling.Word')
|
49
|
-
CoreAnnotations = ::Rjb::import('edu.stanford.nlp.ling.CoreAnnotations')
|
50
|
-
if @@classifier == nil
|
51
|
-
properties = Properties.new
|
52
|
-
options.each_pair do |option,value|
|
53
|
-
#properties.set_property('trainFile', )... Set the options.
|
54
|
-
end
|
55
|
-
@@classifier = CRFBiasedClassifier.new(properties)
|
56
|
-
@@classifier.load_classifier("/ruby/treat/bin/stanford_ner/classifiers/conll.4class.distsim.crf.ser.gz")
|
57
|
-
end
|
58
|
-
w = Word.new('Obama')
|
59
|
-
#puts @@classifier.java_methods
|
60
|
-
puts CoreAnnotations.public_methods.inspect
|
61
|
-
puts @@classifier.classify(w).get()
|
62
|
-
|
63
|
-
|
64
|
-
/*
|
65
|
-
* To change this template, choose Tools | Templates
|
66
|
-
* and open the template in the editor.
|
67
|
-
*/
|
68
|
-
|
69
|
-
package corenlp;
|
70
|
-
import edu.stanford.nlp.ling.CoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
|
71
|
-
import edu.stanford.nlp.ling.CoreAnnotations.CorefGraphAnnotation;
|
72
|
-
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
|
73
|
-
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
|
74
|
-
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
|
75
|
-
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
|
76
|
-
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
|
77
|
-
import edu.stanford.nlp.ling.CoreAnnotations.TreeAnnotation;
|
78
|
-
import edu.stanford.nlp.ling.CoreLabel;
|
79
|
-
import edu.stanford.nlp.pipeline.*;
|
80
|
-
import edu.stanford.nlp.trees.Tree;
|
81
|
-
import edu.stanford.nlp.trees.semgraph.SemanticGraph;
|
82
|
-
import edu.stanford.nlp.util.CoreMap;
|
83
|
-
import edu.stanford.nlp.util.IntTuple;
|
84
|
-
import edu.stanford.nlp.util.Pair;
|
85
|
-
import edu.stanford.nlp.util.Timing;
|
86
|
-
import java.io.File;
|
87
|
-
import java.io.FileInputStream;
|
88
|
-
import java.io.IOException;
|
89
|
-
import java.util.ArrayList;
|
90
|
-
import java.util.List;
|
91
|
-
|
92
|
-
import java.util.Properties;
|
93
|
-
/**
|
94
|
-
*
|
95
|
-
* @author Karthi
|
96
|
-
*/
|
97
|
-
public class Main {
|
98
|
-
|
99
|
-
/**
|
100
|
-
* @param args the command line arguments
|
101
|
-
*/
|
102
|
-
public static void main(String[] args) throws IOException, ClassNotFoundException {
|
103
|
-
// // TODO code application liogic here
|
104
|
-
// System.out.println(System.getProperty("sun.arch.data.model"));
|
105
|
-
//// String str="-cp stanford-corenlp-2010-11-12.jar:stanford-corenlp-models-2010-11-06.jar:xom-1.2.6.jar:jgrapht-0.7.3.jar -Xms3g edu.stanford.nlp.pipeline.StanfordCoreNLP -file <input.txt>";
|
106
|
-
//// args=str.split(" ");
|
107
|
-
//// StanfordCoreNLP.main(args);
|
108
|
-
// Timing tim = new Timing();
|
109
|
-
// Properties props = null;
|
110
|
-
// props.setProperty("annotators", "ssplit, ner, parse, dcoref");
|
111
|
-
//
|
112
|
-
// StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
|
113
|
-
// props = pipeline.getProperties();
|
114
|
-
// long setupTime = tim.report();
|
115
|
-
// String fileName = "input.txt";
|
116
|
-
// ArrayList<File> files=null;
|
117
|
-
// files.add(new File(filename));
|
118
|
-
// pipeline.processFiles(pipeline, files, props);
|
119
|
-
//
|
120
|
-
//
|
121
|
-
|
122
|
-
|
123
|
-
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
|
124
|
-
Properties props = new Properties();
|
125
|
-
FileInputStream in = new FileInputStream("Main.properties");
|
126
|
-
|
127
|
-
props.load(in);
|
128
|
-
in.close();
|
129
|
-
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
|
130
|
-
|
131
|
-
// read some text in the text variable
|
132
|
-
String text = "The doctor can consult with other doctors about this patient. If that is the case, the name of the doctor and the names of the consultants have to be maintained. Otherwise, only the name of the doctor is kept. "; // Add your text here!
|
133
|
-
|
134
|
-
// create an empty Annotation just with the given text
|
135
|
-
Annotation document = new Annotation(text);
|
136
|
-
|
137
|
-
// run all Annotators on this text
|
138
|
-
pipeline.annotate(document);
|
139
|
-
System.out.println(document);
|
140
|
-
|
141
|
-
// these are all the sentences in this document
|
142
|
-
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
|
143
|
-
List<CoreMap> sentences = (List<CoreMap>) document.get(SentencesAnnotation.class);
|
144
|
-
System.out.println(sentences);
|
145
|
-
for(CoreMap sentence: sentences) {
|
146
|
-
// traversing the words in the current sentence
|
147
|
-
// a CoreLabel is a CoreMap with additional token-specific methods
|
148
|
-
for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
|
149
|
-
// this is the text of the token
|
150
|
-
String word = token.get(TextAnnotation.class);
|
151
|
-
// this is the POS tag of the token
|
152
|
-
String pos = token.get(PartOfSpeechAnnotation.class);
|
153
|
-
// this is the NER label of the token
|
154
|
-
String ne = token.get(NamedEntityTagAnnotation.class);
|
155
|
-
}
|
156
|
-
|
157
|
-
// this is the parse tree of the current sentence
|
158
|
-
Tree tree = sentence.get(TreeAnnotation.class);
|
159
|
-
System.out.println(tree);
|
160
|
-
// this is the Stanford dependency graph of the current sentence
|
161
|
-
SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
|
162
|
-
System.out.println(dependencies);
|
163
|
-
}
|
164
|
-
|
165
|
-
// this is the coreference link graph
|
166
|
-
// each link stores an arc in the graph; the first element in the Pair is the source, the second is the target
|
167
|
-
// each node is stored as <sentence id, token id>. Both offsets start at 1!
|
168
|
-
List<Pair<IntTuple, IntTuple>> graph = document.get(CorefGraphAnnotation.class);
|
169
|
-
System.out.println(graph);
|
170
|
-
|
171
|
-
}
|
172
|
-
|
173
|
-
}
|
174
|
-
=end
|