treat 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. data/LICENSE +4 -4
  2. data/TODO +21 -54
  3. data/lib/economist/half_cocked_basel.txt +16 -0
  4. data/lib/economist/hose_and_dry.doc +0 -0
  5. data/lib/economist/hungarys_troubles.abw +70 -0
  6. data/lib/economist/republican_nomination.pdf +0 -0
  7. data/lib/economist/saving_the_euro.odt +0 -0
  8. data/lib/economist/to_infinity_and_beyond.txt +15 -0
  9. data/lib/economist/zero_sum.html +91 -0
  10. data/lib/treat.rb +58 -72
  11. data/lib/treat/buildable.rb +59 -15
  12. data/lib/treat/categories.rb +26 -14
  13. data/lib/treat/category.rb +2 -2
  14. data/lib/treat/delegatable.rb +65 -48
  15. data/lib/treat/doable.rb +44 -0
  16. data/lib/treat/entities.rb +34 -14
  17. data/lib/treat/entities/collection.rb +2 -0
  18. data/lib/treat/entities/document.rb +3 -2
  19. data/lib/treat/entities/entity.rb +105 -90
  20. data/lib/treat/entities/phrases.rb +17 -0
  21. data/lib/treat/entities/tokens.rb +28 -13
  22. data/lib/treat/entities/zones.rb +20 -0
  23. data/lib/treat/extractors.rb +49 -11
  24. data/lib/treat/extractors/coreferences/stanford.rb +68 -0
  25. data/lib/treat/extractors/date/chronic.rb +32 -0
  26. data/lib/treat/extractors/date/ruby.rb +25 -0
  27. data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
  28. data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
  29. data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
  30. data/lib/treat/extractors/language/what_language.rb +49 -0
  31. data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
  32. data/lib/treat/extractors/roles/naive.rb +73 -0
  33. data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
  34. data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
  35. data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
  36. data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
  37. data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
  38. data/lib/treat/extractors/time/nickel.rb +30 -12
  39. data/lib/treat/extractors/topic_words/lda.rb +9 -9
  40. data/lib/treat/extractors/topics/reuters.rb +14 -15
  41. data/lib/treat/extractors/topics/reuters/region.xml +1 -0
  42. data/lib/treat/features.rb +7 -0
  43. data/lib/treat/formatters/readers/abw.rb +6 -1
  44. data/lib/treat/formatters/readers/autoselect.rb +5 -6
  45. data/lib/treat/formatters/readers/doc.rb +3 -1
  46. data/lib/treat/formatters/readers/html.rb +1 -1
  47. data/lib/treat/formatters/readers/image.rb +43 -0
  48. data/lib/treat/formatters/readers/odt.rb +1 -2
  49. data/lib/treat/formatters/readers/pdf.rb +9 -1
  50. data/lib/treat/formatters/readers/xml.rb +40 -0
  51. data/lib/treat/formatters/serializers/xml.rb +50 -14
  52. data/lib/treat/formatters/serializers/yaml.rb +7 -2
  53. data/lib/treat/formatters/unserializers/xml.rb +33 -7
  54. data/lib/treat/formatters/visualizers/dot.rb +90 -20
  55. data/lib/treat/formatters/visualizers/short_value.rb +2 -2
  56. data/lib/treat/formatters/visualizers/standoff.rb +2 -2
  57. data/lib/treat/formatters/visualizers/tree.rb +1 -1
  58. data/lib/treat/formatters/visualizers/txt.rb +13 -4
  59. data/lib/treat/group.rb +16 -10
  60. data/lib/treat/helpers/linguistics_loader.rb +18 -0
  61. data/lib/treat/inflectors.rb +10 -0
  62. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  63. data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
  64. data/lib/treat/inflectors/declensions/english.rb +319 -0
  65. data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
  66. data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
  67. data/lib/treat/install.rb +59 -0
  68. data/lib/treat/kernel.rb +18 -8
  69. data/lib/treat/languages.rb +18 -11
  70. data/lib/treat/languages/arabic.rb +4 -2
  71. data/lib/treat/languages/chinese.rb +6 -2
  72. data/lib/treat/languages/dutch.rb +16 -0
  73. data/lib/treat/languages/english.rb +47 -19
  74. data/lib/treat/languages/french.rb +8 -5
  75. data/lib/treat/languages/german.rb +9 -6
  76. data/lib/treat/languages/greek.rb +16 -0
  77. data/lib/treat/languages/italian.rb +6 -3
  78. data/lib/treat/languages/polish.rb +16 -0
  79. data/lib/treat/languages/portuguese.rb +16 -0
  80. data/lib/treat/languages/russian.rb +16 -0
  81. data/lib/treat/languages/spanish.rb +16 -0
  82. data/lib/treat/languages/swedish.rb +16 -0
  83. data/lib/treat/languages/tags.rb +377 -0
  84. data/lib/treat/lexicalizers.rb +34 -23
  85. data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
  86. data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
  87. data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
  88. data/lib/treat/lexicalizers/tag/brill.rb +35 -40
  89. data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
  90. data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
  91. data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
  92. data/lib/treat/processors.rb +8 -8
  93. data/lib/treat/processors/chunkers/txt.rb +4 -4
  94. data/lib/treat/processors/parsers/enju.rb +114 -99
  95. data/lib/treat/processors/parsers/stanford.rb +109 -41
  96. data/lib/treat/processors/segmenters/punkt.rb +17 -18
  97. data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
  98. data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
  99. data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
  100. data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
  101. data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
  102. data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
  103. data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
  104. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
  105. data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
  106. data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
  107. data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
  108. data/lib/treat/processors/segmenters/stanford.rb +38 -37
  109. data/lib/treat/processors/segmenters/tactful.rb +5 -4
  110. data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
  111. data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
  112. data/lib/treat/processors/tokenizers/perl.rb +2 -2
  113. data/lib/treat/processors/tokenizers/punkt.rb +6 -2
  114. data/lib/treat/processors/tokenizers/stanford.rb +25 -24
  115. data/lib/treat/processors/tokenizers/tactful.rb +1 -2
  116. data/lib/treat/proxies.rb +2 -35
  117. data/lib/treat/registrable.rb +17 -22
  118. data/lib/treat/sugar.rb +11 -11
  119. data/lib/treat/tree.rb +27 -17
  120. data/lib/treat/viewable.rb +29 -0
  121. data/lib/treat/visitable.rb +1 -1
  122. data/test/tc_entity.rb +56 -49
  123. data/test/tc_extractors.rb +41 -18
  124. data/test/tc_formatters.rb +7 -8
  125. data/test/tc_inflectors.rb +19 -24
  126. data/test/tc_lexicalizers.rb +12 -19
  127. data/test/tc_processors.rb +26 -12
  128. data/test/tc_resources.rb +2 -7
  129. data/test/tc_treat.rb +20 -22
  130. data/test/tc_tree.rb +4 -4
  131. data/test/tests.rb +3 -5
  132. data/test/texts.rb +13 -14
  133. data/tmp/INFO +1 -0
  134. metadata +78 -158
  135. data/bin/INFO +0 -1
  136. data/examples/benchmark.rb +0 -81
  137. data/examples/keywords.rb +0 -148
  138. data/lib/treat/detectors.rb +0 -31
  139. data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
  140. data/lib/treat/detectors/format/file.rb +0 -36
  141. data/lib/treat/detectors/language/what_language.rb +0 -29
  142. data/lib/treat/entities/constituents.rb +0 -15
  143. data/lib/treat/entities/sentence.rb +0 -8
  144. data/lib/treat/extractors/named_entity/abner.rb +0 -20
  145. data/lib/treat/extractors/named_entity/stanford.rb +0 -174
  146. data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
  147. data/lib/treat/extractors/time/chronic.rb +0 -20
  148. data/lib/treat/extractors/time/native.rb +0 -18
  149. data/lib/treat/formatters/readers/gocr.rb +0 -26
  150. data/lib/treat/formatters/readers/ocropus.rb +0 -31
  151. data/lib/treat/formatters/visualizers/html.rb +0 -13
  152. data/lib/treat/formatters/visualizers/inspect.rb +0 -20
  153. data/lib/treat/inflectors/declensions/en.rb +0 -18
  154. data/lib/treat/languages/categories.rb +0 -5
  155. data/lib/treat/languages/english/categories.rb +0 -23
  156. data/lib/treat/languages/english/tags.rb +0 -352
  157. data/lib/treat/languages/xinhua.rb +0 -12
  158. data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
  159. data/lib/treat/string.rb +0 -5
  160. data/test/tc_detectors.rb +0 -26
data/bin/INFO DELETED
@@ -1 +0,0 @@
1
- This is where Treat will look for the Stanford JAR files by default. You can change this to another directory by setting Treat.bin = '/path/to/your/folder/' at runtime.
data/examples/benchmark.rb DELETED
@@ -1,81 +0,0 @@
1
- require 'benchmark'
2
- require 'treat'
3
-
4
- Benchmark.bmbm do |x|
5
-
6
- Treat.edulcorate
7
-
8
- =begin
9
- # Readers
10
- x.report("Read:PDF") { doc = Document 'pages/hhmm_article.pdf'; doc.read }
11
- x.report("Read:TXT") { doc = Document 'pages/kant_short.txt'; doc.read }
12
- x.report("Read:YAML") { doc = Document 'pages/nanotechnology_article.yml'; doc.read }
13
- # x.report("Read:XML") { doc = Document 'pages/test.xml'; doc.read }
14
- x.report("Read:Image") { doc = Document 'pages/novel_page.jpg'; doc.read }
15
-
16
- # Read collection of texts.
17
- coll = Collection 'pages'
18
- coll.read
19
- =end
20
-
21
- # Processors.
22
- # x.report("Cluster:LDA") { coll.cluster(:lda) }
23
- x.report("Chunk:txt ") { text.chunk(:txt) }
24
- x.report("Segment:punkt ") { text.segment(:punkt) }
25
- x.report("Segment:tactful ") { text.segment(:tactful) }
26
- x.report("Segment:stanford ") { text.segment(:stanford) }
27
- x.report("Tokenize:macintyre ") { text.tokenize(:macintyre) }
28
- x.report("Tokenize:multilingual "){ text.tokenize(:multilingual) }
29
- x.report("Tokenize:perl "){ text.tokenize(:perl) }
30
- x.report("Tokenize:stanford ") { text.tokenize(:stanford) }
31
- x.report("Parse:enju") { text = text.parse(:enju) }
32
- # x.report("Parse:stanford") { text = text.parse(:stanford) }
33
- # x.report("Parse:link") { text = text.parse(:link) }
34
-
35
- doc = Document 'pages/kant_short.txt'
36
- text = doc.read.text.chunk.segment.tokenize
37
-
38
- # Formatters.
39
- yaml = nil; xml = nil
40
- x.report("Serialize:yaml") { yaml = text.serialize(:yaml) }
41
- x.report("Serialize:xml") { xml = text.serialize(:xml) }
42
- x.report("Visualize:tree") { text.visualize(:tree) }
43
- x.report("Visualize:txt") { text.visualize(:txt) }
44
- # x.report("Visualize:dot") { text.visualize(:dot) }
45
- # x.report("Visualize:standoff") { text.visualize(:standoff) }
46
- # x.report("Visualize:simple_html") { text.visualize(:html) }
47
- # Clean: html
48
-
49
- # Detectors
50
- x.report("Langugage:what_language ") { text.language(:what_language) }
51
- x.report("Encoding:r_chardet19 ") { text.encoding(:r_chardet19) }
52
- x.report("Format:file ") { text.format(:file) }
53
-
54
- # Extractors
55
- x.report("Date:chronic") { '2007/02/12'.date(:chronic) }
56
- x.report("Date:native") { '2007/02/12'.date(:native) }
57
- x.report("Time:chronic") { '2007/02/12'.time(:chronic) }
58
- x.report("Topic:reuters") { text.topic }
59
- x.report("Statistics:frequency:") { text.each_token { |token| token.statistics(:frequency) } }
60
- # x.report("Statistics:position:") { text.each_token { |token| token.statistics(:position) } }
61
-
62
- # Inflectors
63
- # x.report("Lemma:elemma") { text.each_word { |word| word.lemma(:elemma) } }
64
- x.report("Stem:porter_r") { text.each_word { |word| word.stem(:porter) } }
65
- x.report("Stem:porter_c") { text.each_word { |word| word.stem(:porter_c) } }
66
- x.report("Stem:uea") { text.each_word { |word| word.stem(:uea) } }
67
- x.report("Declense:granger") { text.each_word { |word| word.declense(:granger) } }
68
- # x.report("Inflect:granger") { text.each_noun { |word| word.plural(:granger) } }
69
-
70
- # Statistics
71
- x.report("Entity:word_count") { text.word_count }
72
-
73
- # puts text.words_with_cat(:noun).inspect
74
-
75
- # Lexicalizers
76
- x.report("Tag:stanford") { text.each_word { |word| word.tag(:stanford) } }
77
- # x.report("Tag:brill") { text.each_word { |word| word.tag(:brill) } }
78
- # x.report("Tag:lingua") { text.each_word { |word| word.tag(:lingua) } }
79
- # x.report("Lex:wordnet") { text.each_word { |word| word.lex(:wordnet) } }
80
-
81
- end
data/examples/keywords.rb DELETED
@@ -1,148 +0,0 @@
1
- require 'benchmark'
2
- require 'treat'
3
-
4
- Treat.edulcorate
5
-
6
- c = Collection.from_serialized('texts/corpus.yml')
7
-
8
- =begin
9
- c.each_text do |t|
10
- t.chunk.segment.parse(:stanford)
11
- puts "Done text #{t.id}."
12
- end
13
-
14
- c.serialize(:yaml).save("economist/corpus.yml")
15
-
16
- =end
17
-
18
- topic_words = c.topic_words(
19
- :lda,
20
- :topics => 5,
21
- :words_per_topic => 5,
22
- :iterations => 20
23
- )
24
-
25
- c.each_document do |d|
26
-
27
- sentences = d.key_sentences(
28
- :topics_frequency,
29
- :topic_words => topic_words,
30
- :threshold => 4
31
- )
32
-
33
- tm = d.statistics(
34
- :transition_matrix,
35
- :features => [:tag],
36
- :entity_type => :word,
37
- :condition => lambda do |word|
38
- word.has?(:is_keyword?) &&
39
- word.is_keyword?
40
- end
41
- )
42
-
43
- sentences.each do |sentence|
44
- sentence.each_word do |word|
45
- score = word.statistics(
46
- :transition_probability,
47
- :transition_matrix => tm,
48
- :relationships => [:parent, :left, :right, :children]
49
- )
50
- if word.has?(:is_keyword?) &&
51
- word.is_keyword?
52
- score += 0.5
53
- end
54
- if score > 1
55
- puts word.to_s
56
- end
57
- end
58
- end
59
-
60
- end
61
-
62
-
63
-
64
- Treat.edulcorate
65
- Treat.bin = '/ruby/nat/bin'
66
-
67
- c = Collection 'economist'
68
- c.each_document { |doc| doc.chunk.segment.tokenize }
69
-
70
- topic_words = c.topic_words(
71
- :lda,
72
- :topics => 5,
73
- :words_per_topic => 5,
74
- :iterations => 20
75
- )
76
-
77
- keywords = c.keywords(
78
- :topics_frequency,
79
- :topic_words => topic_words,
80
- :tf_idf_threshold => 180
81
- )
82
-
83
- puts keywords.inspect
84
-
85
- abort
86
-
87
- c = Phrase 'a test clause'
88
- c.parse
89
- puts c.visualize(:tree)
90
- puts c.visualize(:inspect)
91
- puts c.visualize(:short_value)
92
- puts c.visualize(:standoff)
93
- puts c.visualize(:tree)
94
-
95
- c.serialize(:yaml).save('test.yml')
96
- c.serialize(:xml).save('test.xml')
97
-
98
- d = Phrase 'test.yml'
99
- d.print_tree
100
- d = Phrase 'test.xml'
101
- d.print_tree
102
-
103
- puts d.words[0].position_in_parent
104
- abort
105
-
106
- w = Word 'running'
107
- puts w.stem(:porter_c)
108
- puts w.stem(:porter)
109
- puts w.stem(:uea)
110
-
111
- w = Word 'run'
112
-
113
- puts w.infinitive(:linguistics)
114
- puts w.present_participle(:linguistics)
115
- puts w.plural(:linguistics)
116
-
117
- w = Word 'table'
118
-
119
- puts w.synonyms.inspect
120
- puts w.antonyms.inspect
121
- puts w.hyponyms.inspect
122
- puts w.hypernyms.inspect
123
-
124
- n = Number 2
125
- puts n.ordinal_words(:linguistics)
126
- puts n.cardinal_words(:linguistics)
127
-
128
- s = Sentence 'A sentence to parse.'
129
- s.dup.parse(:enju).print_tree
130
- s.dup.parse(:stanford).print_tree
131
-
132
- s = Sentence 'A sentence to tokenize'
133
- s.dup.tokenize(:macintyre).print_tree
134
- s.dup.tokenize(:multilingual).print_tree
135
- s.dup.tokenize(:perl).print_tree
136
- s.dup.tokenize(:punkt).print_tree
137
- s.dup.tokenize(:stanford).print_tree
138
- s.dup.tokenize(:tactful).print_tree
139
-
140
-
141
- =begin
142
- c = Collection 'economist'
143
- # c.each_document { |d| d.chunk.segment.tokenize }
144
- c.documents[0].chunk.segment
145
- c.sentences[0].parse(:enju)
146
- c.each_word { |word| word.stem }
147
- c.visualize(:dot, features: [:tag]).save('test.dot')
148
- =end
data/lib/treat/detectors.rb DELETED
@@ -1,31 +0,0 @@
1
- module Treat
2
- # Detectors detect a specific meta-information about
3
- # an entity, such as encoding, format and language.
4
- #
5
- # Detectors are language-independent, and thus there
6
- # are default algorithms specified for each of them.
7
- module Detectors
8
- # Group for algorithms that detect encoding.
9
- module Encoding
10
- extend Group
11
- self.type = :annotator
12
- self.targets = [:document]
13
- self.default = :r_chardet19
14
- end
15
- # Group for algorithms that support format detection.
16
- module Format
17
- extend Group
18
- self.type = :annotator
19
- self.targets = [:document]
20
- self.default = :file
21
- end
22
- # Group for algorithms that do language detection.
23
- module Language
24
- extend Group
25
- self.type = :annotator
26
- self.targets = [:entity]
27
- self.default = :what_language
28
- end
29
- extend Treat::Category
30
- end
31
- end
data/lib/treat/detectors/encoding/r_chardet19.rb DELETED
@@ -1,27 +0,0 @@
1
- module Treat
2
- module Detectors
3
- module Encoding
4
- # Require the 'rchardet19' gem.
5
- silence_warnings { require 'rchardet19' }
6
- # A wrapper for the 'rchardet19' gem, which
7
- # detects the encoding of a file.
8
- class RChardet19
9
- # Returns the encoding of the document according
10
- # to the 'rchardet19' gem.
11
- #
12
- # Options: none.
13
- def self.encoding(document, options={})
14
- r = CharDet.detect(document.file)
15
- if r.encoding
16
- Treat::Feature.new({
17
- r.encoding.
18
- gsub('-', '_').downcase.intern =>
19
- r.confidence}).best
20
- else
21
- :unknown
22
- end
23
- end
24
- end
25
- end
26
- end
27
- end
data/lib/treat/detectors/format/file.rb DELETED
@@ -1,36 +0,0 @@
1
- module Treat
2
- module Detectors
3
- module Format
4
- # A wrapper for the *NIX 'file' command,
5
- # witch uses etc/magic to detect the format
6
- # of a file.
7
- class File
8
- # Returns an identifier representing
9
- # the format of a file using the *NIX
10
- # 'file' command.
11
- #
12
- # Options: none.
13
- def self.format(entity, options = {})
14
- format = nil
15
- create_temp_file(:txt, entity.to_s) do |tmp|
16
- format = `file #{tmp}`
17
- end
18
- if format.scan('text')
19
- :txt
20
- elsif format.scan('XML')
21
- :xml
22
- elsif format.scan('HTML')
23
- :html
24
- elsif format.scan('image')
25
- :image
26
- elsif format.scan('PDF')
27
- :pdf
28
- else
29
- raise Treat::Exception,
30
- "Unsupported text format #{format}."
31
- end
32
- end
33
- end
34
- end
35
- end
36
- end
data/lib/treat/detectors/language/what_language.rb DELETED
@@ -1,29 +0,0 @@
1
- module Treat
2
- module Detectors
3
- module Language
4
- # Require the 'whatlanguage' gem.
5
- silence_warnings { require 'whatlanguage' }
6
- # Adaptor for the 'whatlanguage' gem, which
7
- # performs probabilistic language detection.
8
- class WhatLanguage < LanguageDetector
9
- # Keep only once instance of the gem class.
10
- @@detector = nil
11
- # Detect the language of an entity using the
12
- # 'whatlanguage' gem. Return an identifier
13
- # corresponding to the ISO-639-2 code for the
14
- # language.
15
- def self.language(entity, options = {})
16
- predetection = super(entity, options)
17
- return predetection if predetection
18
- @@detector ||= ::WhatLanguage.new(:possibilities)
19
- possibilities = @@detector.process_text(entity.to_s)
20
- lang = {}
21
- possibilities.each do |k,v|
22
- lang[Treat::Languages.find(k)] = v
23
- end
24
- Treat::Feature.new(lang).best
25
- end
26
- end
27
- end
28
- end
29
- end
data/lib/treat/entities/constituents.rb DELETED
@@ -1,15 +0,0 @@
1
- module Treat
2
- module Entities
3
- # Represents any syntactic constituent
4
- # of a sentence.
5
- class Constituent < Entity
6
- end
7
- # Represents a phrase inside a sentence
8
- # or by itself.
9
- class Phrase < Constituent
10
- end
11
- # Represents a clause inside a sentence.
12
- class Clause < Constituent
13
- end
14
- end
15
- end
data/lib/treat/entities/sentence.rb DELETED
@@ -1,8 +0,0 @@
1
- module Treat
2
- module Entities
3
- # Represents a sentence.
4
- class Sentence < Entity
5
- def subject(l = nil, o = {}); link(l, o.merge({:linkage => :subject})); end
6
- end
7
- end
8
- end
data/lib/treat/extractors/named_entity/abner.rb DELETED
@@ -1,20 +0,0 @@
1
- module Treat
2
- module Extractors
3
- module NamedEntity
4
- class Abner
5
- # Require the Ruby-Java bridge.
6
- silence_warnings do
7
- require 'rjb'
8
- Rjb::load('', ['-Xms256M', '-Xmx512M'])
9
- puts Rjb.import('tagger')
10
- end
11
- @@tagger = nil
12
- def self.named_entity(entity)
13
- @@tagger ||= AbnerTagger.new
14
- @@tagger.tokenize(entity)
15
- end
16
- end
17
- end
18
- end
19
- end
20
-
data/lib/treat/extractors/named_entity/stanford.rb DELETED
@@ -1,174 +0,0 @@
1
- module Treat
2
- module Extractors
3
- module NamedEntity
4
- class Stanford
5
- # Require the Ruby-Java bridge.
6
- silence_warnings do
7
- require 'rjb'
8
- Rjb::load(nil, ['-Xms256M', '-Xmx1024M'])
9
- Rjb::add_jar('/ruby/treat/bin/treat/treat.jar')
10
- Rjb::add_jar('/ruby/treat/bin/stanford/xom.jar')
11
- Rjb::add_jar('/ruby/treat/bin/stanford/joda-time.jar')
12
- Rjb::add_jar('/ruby/treat/bin/stanford/stanford-corenlp.jar')
13
- StanfordCoreNLP = Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP')
14
- Annotation = Rjb::import('edu.stanford.nlp.pipeline.Annotation')
15
- NamedEntityTagAnnotation = Rjb::import('edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation')
16
- Properties = Rjb::import('java.util.Properties')
17
- end
18
- @@classifier = nil
19
- def self.named_entity(entity, options = {})
20
- properties = Properties.new
21
- properties.set_property('annotators', 'tokenize, ssplit, pos, lemma, ner')
22
- properties.set_property('pos.model', '/ruby/treat/bin/stanford/taggers/english-left3words-distsim.tagger')
23
- properties.set_property('ner.model.3class', '/ruby/treat/bin/stanford/classifiers/all.3class.distsim.crf.ser.gz')
24
- properties.set_property('ner.model.7class', '/ruby/treat/bin/stanford/classifiers/muc.7class.distsim.crf.ser.gz')
25
- properties.set_property('ner.model.MISCclass', '/ruby/treat/bin/stanford/classifiers/conll.4class.distsim.crf.ser.gz')
26
- properties.set_property('parser.model', '/ruby/treat/bin/stanford-parser/grammar/englishPCFG.ser.gz')
27
- silence_stream(STDOUT) do
28
- pipeline = StanfordCoreNLP.new(properties)
29
- end
30
- stanford_entity = Annotation.new(entity.to_s)
31
- pipeline.annotate(stanford_entity)
32
- puts stanford_entity.java_methods
33
- puts stanford_entity.get_string(NamedEntityTagAnnotation)
34
- end
35
- end
36
- end
37
- end
38
- end
39
-
40
-
41
- =begin
42
-
43
-
44
-
45
- CRFBiasedClassifier = Rjb::import('edu.stanford.nlp.ie.crf.CRFBiasedClassifier')
46
- Properties = Rjb::import('java.util.Properties')
47
- List = ::Rjb::import('java.util.ArrayList')
48
- Word = ::Rjb::import('edu.stanford.nlp.ling.Word')
49
- CoreAnnotations = ::Rjb::import('edu.stanford.nlp.ling.CoreAnnotations')
50
- if @@classifier == nil
51
- properties = Properties.new
52
- options.each_pair do |option,value|
53
- #properties.set_property('trainFile', )... Set the options.
54
- end
55
- @@classifier = CRFBiasedClassifier.new(properties)
56
- @@classifier.load_classifier("/ruby/treat/bin/stanford_ner/classifiers/conll.4class.distsim.crf.ser.gz")
57
- end
58
- w = Word.new('Obama')
59
- #puts @@classifier.java_methods
60
- puts CoreAnnotations.public_methods.inspect
61
- puts @@classifier.classify(w).get()
62
-
63
-
64
- /*
65
- * To change this template, choose Tools | Templates
66
- * and open the template in the editor.
67
- */
68
-
69
- package corenlp;
70
- import edu.stanford.nlp.ling.CoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
71
- import edu.stanford.nlp.ling.CoreAnnotations.CorefGraphAnnotation;
72
- import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
73
- import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
74
- import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
75
- import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
76
- import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
77
- import edu.stanford.nlp.ling.CoreAnnotations.TreeAnnotation;
78
- import edu.stanford.nlp.ling.CoreLabel;
79
- import edu.stanford.nlp.pipeline.*;
80
- import edu.stanford.nlp.trees.Tree;
81
- import edu.stanford.nlp.trees.semgraph.SemanticGraph;
82
- import edu.stanford.nlp.util.CoreMap;
83
- import edu.stanford.nlp.util.IntTuple;
84
- import edu.stanford.nlp.util.Pair;
85
- import edu.stanford.nlp.util.Timing;
86
- import java.io.File;
87
- import java.io.FileInputStream;
88
- import java.io.IOException;
89
- import java.util.ArrayList;
90
- import java.util.List;
91
-
92
- import java.util.Properties;
93
- /**
94
- *
95
- * @author Karthi
96
- */
97
- public class Main {
98
-
99
- /**
100
- * @param args the command line arguments
101
- */
102
- public static void main(String[] args) throws IOException, ClassNotFoundException {
103
- // // TODO code application liogic here
104
- // System.out.println(System.getProperty("sun.arch.data.model"));
105
- //// String str="-cp stanford-corenlp-2010-11-12.jar:stanford-corenlp-models-2010-11-06.jar:xom-1.2.6.jar:jgrapht-0.7.3.jar -Xms3g edu.stanford.nlp.pipeline.StanfordCoreNLP -file <input.txt>";
106
- //// args=str.split(" ");
107
- //// StanfordCoreNLP.main(args);
108
- // Timing tim = new Timing();
109
- // Properties props = null;
110
- // props.setProperty("annotators", "ssplit, ner, parse, dcoref");
111
- //
112
- // StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
113
- // props = pipeline.getProperties();
114
- // long setupTime = tim.report();
115
- // String fileName = "input.txt";
116
- // ArrayList<File> files=null;
117
- // files.add(new File(filename));
118
- // pipeline.processFiles(pipeline, files, props);
119
- //
120
- //
121
-
122
-
123
- // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
124
- Properties props = new Properties();
125
- FileInputStream in = new FileInputStream("Main.properties");
126
-
127
- props.load(in);
128
- in.close();
129
- StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
130
-
131
- // read some text in the text variable
132
- String text = "The doctor can consult with other doctors about this patient. If that is the case, the name of the doctor and the names of the consultants have to be maintained. Otherwise, only the name of the doctor is kept. "; // Add your text here!
133
-
134
- // create an empty Annotation just with the given text
135
- Annotation document = new Annotation(text);
136
-
137
- // run all Annotators on this text
138
- pipeline.annotate(document);
139
- System.out.println(document);
140
-
141
- // these are all the sentences in this document
142
- // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
143
- List<CoreMap> sentences = (List<CoreMap>) document.get(SentencesAnnotation.class);
144
- System.out.println(sentences);
145
- for(CoreMap sentence: sentences) {
146
- // traversing the words in the current sentence
147
- // a CoreLabel is a CoreMap with additional token-specific methods
148
- for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
149
- // this is the text of the token
150
- String word = token.get(TextAnnotation.class);
151
- // this is the POS tag of the token
152
- String pos = token.get(PartOfSpeechAnnotation.class);
153
- // this is the NER label of the token
154
- String ne = token.get(NamedEntityTagAnnotation.class);
155
- }
156
-
157
- // this is the parse tree of the current sentence
158
- Tree tree = sentence.get(TreeAnnotation.class);
159
- System.out.println(tree);
160
- // this is the Stanford dependency graph of the current sentence
161
- SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
162
- System.out.println(dependencies);
163
- }
164
-
165
- // this is the coreference link graph
166
- // each link stores an arc in the graph; the first element in the Pair is the source, the second is the target
167
- // each node is stored as <sentence id, token id>. Both offsets start at 1!
168
- List<Pair<IntTuple, IntTuple>> graph = document.get(CorefGraphAnnotation.class);
169
- System.out.println(graph);
170
-
171
- }
172
-
173
- }
174
- =end