stanford-core-nlp 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,50 @@
1
+ Alabama Ala. AL
2
+ Alaska Alaska AK
3
+ Arizona Ariz. AZ
4
+ Arkansas Ark. AR
5
+ California Calif. CA
6
+ Colorado Colo. CO
7
+ Connecticut Conn. CT
8
+ Delaware Del. DE
9
+ Florida Fla. FL
10
+ Georgia Ga. GA
11
+ Hawaii Hawaii HI
12
+ Idaho Idaho ID
13
+ Illinois Ill. IL
14
+ Indiana Ind. IN
15
+ Iowa Iowa IA
16
+ Kansas Kans. KS
17
+ Kentucky Ky. KY
18
+ Louisiana La. LA
19
+ Maine Maine ME
20
+ Maryland Md. MD
21
+ Massachusetts Mass. MA
22
+ Michigan Mich. MI
23
+ Minnesota Minn. MN
24
+ Mississippi Miss. MS
25
+ Missouri Mo. MO
26
+ Montana Mont. MT
27
+ Nebraska Nebr. NE
28
+ Nevada Nev. NV
29
+ New Hampshire N.H. NH
30
+ New Jersey N.J. NJ
31
+ New Mexico N.M. NM
32
+ New York N.Y. NY
33
+ North Carolina N.C. NC
34
+ North Dakota N.D. ND
35
+ Ohio Ohio OH
36
+ Oklahoma Okla. OK
37
+ Oregon Ore. OR
38
+ Pennsylvania Pa. PA
39
+ Rhode Island R.I. RI
40
+ South Carolina S.C. SC
41
+ South Dakota S.D. SD
42
+ Tennessee Tenn. TN
43
+ Texas Tex. TX
44
+ Utah Utah UT
45
+ Vermont Vt. VT
46
+ Virginia Va. VA
47
+ Washington Wash. WA
48
+ West Virginia W.Va. WV
49
+ Wisconsin Wis. WI
50
+ Wyoming Wyo. WY
File without changes
Binary file
Binary file
@@ -0,0 +1,102 @@
1
+ Stanford POS Tagger, v. 3.1.0 - 2011-12-16
2
+ Copyright (c) 2002-2011 The Board of Trustees of
3
+ The Leland Stanford Junior University. All Rights Reserved.
4
+
5
+ This document contains (some) information about the models included in
6
+ this release and that may be downloaded from the POS tagger website at
7
+ http://nlp.stanford.edu/software/tagger.shtml . If you have downloaded
8
+ the full tagger, all of the models mentioned in this document are in the
9
+ downloaded package in the same directory as this readme. Otherwise,
10
+ included in the download are two
11
+ English taggers, and the other taggers may be downloaded from the
12
+ website. All taggers are accompanied by the props files used to create
13
+ them; please examine these files for more detailed information about the
14
+ creation of the taggers.
15
+
16
+ For English, the bidirectional taggers are slightly more accurate, but
17
+ tag much more slowly; choose the appropriate tagger based on your
18
+ speed/performance needs.
19
+
20
+ English taggers
21
+ ---------------------------
22
+ bidirectional-distsim-wsj-0-18.tagger
23
+ Trained on WSJ sections 0-18 using a bidirectional architecture and
24
+ including word shape and distributional similarity features.
25
+ Penn Treebank tagset.
26
+ Performance:
27
+ 97.28% correct on WSJ 19-21
28
+ (90.46% correct on unknown words)
29
+
30
+ left3words-wsj-0-18.tagger
31
+ Trained on WSJ sections 0-18 using the left3words architecture and
32
+ includes word shape features. Penn tagset.
33
+ Performance:
34
+ 96.97% correct on WSJ 19-21
35
+ (88.85% correct on unknown words)
36
+
37
+ left3words-distsim-wsj-0-18.tagger
38
+ Trained on WSJ sections 0-18 using the left3words architecture and
39
+ includes word shape and distributional similarity features. Penn tagset.
40
+ Performance:
41
+ 97.01% correct on WSJ 19-21
42
+ (89.81% correct on unknown words)
43
+
44
+
45
+ Chinese tagger
46
+ ---------------------------
47
+ chinese.tagger
48
+ Trained on a combination of Chinese Treebank texts from Chinese and Hong
49
+ Kong sources.
50
+ LDC Chinese Treebank POS tag set.
51
+ Performance:
52
+ 94.13% on a combination of Chinese and Hong Kong texts
53
+ (78.92% on unknown words)
54
+
55
+ Arabic tagger
56
+ ---------------------------
57
+ arabic-accurate.tagger
58
+ Trained on the *entire* ATB p1-3.
59
+ When trained on the train part of the ATB p1-3 split done for the 2005
60
+ JHU Summer Workshop (Diab split), using (augmented) Bies tags, it gets
61
+ the following performance:
62
+ Performance:
63
+ 96.50% on dev portion according to Diab split
64
+ (80.59% on unknown words)
65
+
66
+ arabic-fast.tagger
67
+ 4x speed improvement over "accurate".
68
+ Performance:
69
+ 96.34% on dev portion according to Diab split
70
+ (80.28% on unknown words)
71
+
72
+
73
+ French tagger
74
+ ---------------------------
75
+ french.tagger
76
+ Trained on the French treebank.
77
+
78
+ German tagger
79
+ ---------------------------
80
+ german-hgc.tagger
81
+ Trained on the first 80% of the Negra corpus, which uses the STTS tagset.
82
+ The Stuttgart-Tübingen Tagset (STTS) is a set of 54 tags for annotating
83
+ German text corpora with part-of-speech labels, which was jointly
84
+ developed by the Institut für maschinelle Sprachverarbeitung of the
85
+ University of Stuttgart and the Seminar für Sprachwissenschaft of the
86
+ University of Tübingen. See:
87
+ http://www.ims.uni-stuttgart.de/projekte/CQPDemos/Bundestag/help-tagset.html
88
+ This model uses features from the distributional similarity clusters
89
+ built over the HGC.
90
+ Performance:
91
+ 96.90% on the first half of the remaining 20% of the Negra corpus (dev set)
92
+ (90.33% on unknown words)
93
+
94
+ german-dewac.tagger
95
+ This model uses features from the distributional similarity clusters
96
+ built from the deWac web corpus.
97
+
98
+ german-fast.tagger
99
+ Lacks distributional similarity features, but is several times faster
100
+ than the other alternatives.
101
+ Performance:
102
+ 96.61% overall / 86.72% unknown.
@@ -0,0 +1,33 @@
1
+ ## tagger training invoked at Thu Dec 15 01:17:19 PST 2011 with arguments:
2
+ model = english-bidirectional-distsim.tagger
3
+ arch = bidirectional5words,naacl2003unknowns,allwordshapes(-1,1),distsim(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1),distsimconjunction(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1)
4
+ trainFile = /u/nlp/data/pos-tagger/english/train-wsj-0-18;/u/nlp/data/pos-tagger/english/train-extra-english
5
+ closedClassTags =
6
+ closedClassTagThreshold = 40
7
+ curWordMinFeatureThresh = 2
8
+ debug = false
9
+ debugPrefix =
10
+ tagSeparator = _
11
+ encoding = UTF-8
12
+ iterations = 100
13
+ lang = english
14
+ learnClosedClassTags = false
15
+ minFeatureThresh = 2
16
+ openClassTags =
17
+ rareWordMinFeatureThresh = 5
18
+ rareWordThresh = 5
19
+ search = owlqn
20
+ sgml = false
21
+ sigmaSquared = 0.5
22
+ regL1 = 0.75
23
+ tagInside =
24
+ tokenize = true
25
+ tokenizerFactory =
26
+ tokenizerOptions =
27
+ verbose = false
28
+ verboseResults = true
29
+ veryCommonWordThresh = 250
30
+ xmlInput =
31
+ outputFile =
32
+ outputFormat = slashTags
33
+ outputFormatOptions =
@@ -0,0 +1,33 @@
1
+ ## tagger training invoked at Thu Dec 15 01:17:21 PST 2011 with arguments:
2
+ model = english-left3words-distsim.tagger
3
+ arch = left3words,naacl2003unknowns,wordshapes(-1,1),distsim(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1),distsimconjunction(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1)
4
+ trainFile = /u/nlp/data/pos-tagger/english/train-wsj-0-18;/u/nlp/data/pos-tagger/english/train-extra-english
5
+ closedClassTags =
6
+ closedClassTagThreshold = 40
7
+ curWordMinFeatureThresh = 2
8
+ debug = false
9
+ debugPrefix =
10
+ tagSeparator = _
11
+ encoding = UTF-8
12
+ iterations = 100
13
+ lang = english
14
+ learnClosedClassTags = false
15
+ minFeatureThresh = 2
16
+ openClassTags =
17
+ rareWordMinFeatureThresh = 10
18
+ rareWordThresh = 5
19
+ search = owlqn
20
+ sgml = false
21
+ sigmaSquared = 0.0
22
+ regL1 = 0.75
23
+ tagInside =
24
+ tokenize = true
25
+ tokenizerFactory =
26
+ tokenizerOptions =
27
+ verbose = false
28
+ verboseResults = true
29
+ veryCommonWordThresh = 250
30
+ xmlInput =
31
+ outputFile =
32
+ outputFormat = slashTags
33
+ outputFormatOptions =
Binary file
@@ -0,0 +1,106 @@
1
# High-level Ruby bindings to the Stanford CoreNLP Java pipeline, bridged
# through Rjb. Typical use: optionally adjust StanfordCoreNLP.jar_path and
# StanfordCoreNLP.jvm_flags, then call StanfordCoreNLP.load to get a pipeline.
module StanfordCoreNLP

  VERSION = '0.1'
  require 'stanford-core-nlp/jar-loader.rb'

  class << self
    # The path in which to look for the Stanford JAR files.
    # This is passed to JarLoader and is also prefixed (by plain string
    # concatenation) to every model path in default_properties, so it
    # must end with a path separator.
    attr_accessor :jar_path
    # The flags used to start the JVM.
    # Parser and named entity recognizer are very memory consuming.
    attr_accessor :jvm_flags
  end

  self.jar_path = File.dirname(__FILE__) + '/../bin/'
  self.jvm_flags = ['-Xms512M', '-Xmx1024M']

  # Return the default properties (English models with
  # tokenizer and sentence splitter).
  #
  # The hash is rebuilt on every call so that it always reflects the
  # current value of jar_path.
  def self.default_properties
    path = self.jar_path
    {
      'annotators' => 'tokenize, ssplit',
      'pos.model' => path + 'taggers/english-left3words-distsim.tagger',
      'ner.model.3class' => path + 'classifiers/all.3class.distsim.crf.ser.gz',
      'ner.model.7class' => path + 'classifiers/muc.7class.distsim.crf.ser.gz',
      'ner.model.MISCclass' => path + 'classifiers/conll.4class.distsim.crf.ser.gz',
      'parser.model' => path + 'grammar/englishPCFG.ser.gz',
      'dcoref.demonym' => path + 'dcoref/demonyms.txt',
      'dcoref.animate' => path + 'dcoref/animate.unigrams.txt',
      'dcoref.female' => path + 'dcoref/female.unigrams.txt',
      'dcoref.inanimate' => path + 'dcoref/inanimate.unigrams.txt',
      'dcoref.male' => path + 'dcoref/male.unigrams.txt',
      'dcoref.neutral' => path + 'dcoref/neutral.unigrams.txt',
      'dcoref.plural' => path + 'dcoref/plural.unigrams.txt',
      'dcoref.singular' => path + 'dcoref/singular.unigrams.txt',
      'dcoref.states' => path + 'dcoref/state-abbreviations.txt',
      'dcoref.countries' => path + 'dcoref/unknown.txt', # Fix - can somebody provide this file?
      'dcoref.states.provinces' => path + 'dcoref/unknown.txt', # Fix - can somebody provide this file?
      'dcoref.extra.gender' => path + 'dcoref/namegender.combine.txt'
    }
  end

  # Load a StanfordCoreNLP pipeline.
  #
  # The JVM is started with StanfordCoreNLP.jvm_flags and the JARs are
  # looked up in StanfordCoreNLP.jar_path; set both before calling this.
  #
  # properties - Hash of CoreNLP property name => value. Entries override
  #              the ones from default_properties. Defaults to {}.
  #
  # Returns a new pipeline object (Rjb proxy of
  # edu.stanford.nlp.pipeline.StanfordCoreNLP).
  def self.load(properties = {})
    self.load_jars(jvm_flags, self.jar_path)
    self.create_classes
    properties = default_properties.merge(properties)
    CoreNLP.new(get_properties(properties))
  end

  # Load the jars.
  #
  # jvm_flags - Array of String flags handed to JarLoader.
  # jar_path  - String directory prefix in which the JARs are looked up.
  def self.load_jars(jvm_flags, jar_path)
    JarLoader.jvm_flags = jvm_flags
    JarLoader.jar_path = jar_path
    %w[joda-time.jar xom.jar stanford-corenlp.jar bridge.jar].each do |jar|
      JarLoader.load(jar)
    end
  end

  # Create the Ruby classes for core classes.
  #
  # Safe to call repeatedly: a constant that this module already defines
  # is left alone, so calling load more than once does not re-import the
  # Java classes or emit "already initialized constant" warnings.
  def self.create_classes
    [
      [:CoreNLP,          'edu.stanford.nlp.pipeline.StanfordCoreNLP'],
      [:Annotation,       'edu.stanford.nlp.pipeline.Annotation'],
      [:Properties,       'java.util.Properties'],
      [:AnnotationBridge, 'AnnotationBridge']
    ].each do |name, java_class|
      # inherit = false: only look at constants defined on this module,
      # not at same-named top-level constants.
      const_set(name, Rjb::import(java_class)) unless const_defined?(name, false)
    end
    # A more intuitive alias.
    const_set(:Text, const_get(:Annotation)) unless const_defined?(:Text, false)
  end

  # Create a java.util.Properties object from a hash.
  #
  # Keys and values are converted with to_s before being handed to Java,
  # so symbols and numbers are accepted as well as strings.
  def self.get_properties(properties)
    props = Properties.new
    properties.each do |property, value|
      props.set_property(property.to_s, value.to_s)
    end
    props
  end

  # Extend every Rjb Java proxy with a few Ruby-style conveniences.
  Rjb::Rjb_JavaProxy.class_eval do

    # Get an annotation using the annotation bridge.
    #
    # annotation - Symbol or String in snake_case, e.g. :sentences or
    #              :coref_chain. It is converted to CamelCase and suffixed
    #              with "Annotation"; names whose first segment is "coref"
    #              are resolved inside CorefCoreAnnotations, all others
    #              inside CoreAnnotations.
    def get(annotation)
      name = annotation.to_s
      base_class = (name.split('_')[0] == 'coref') ?
        'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
        'edu.stanford.nlp.ling.CoreAnnotations$'
      anno_class = name.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
      url = "#{base_class}#{anno_class}Annotation"
      StanfordCoreNLP::AnnotationBridge.getAnnotation(self, url)
    end

    # Shorthand for to_string defined by Java classes.
    def to_s; to_string; end

    # Provide Ruby-style iterators to wrap Java iterators.
    #
    # Yields each element of the underlying Java iterator. Without a
    # block an Enumerator is returned instead.
    # Raises RuntimeError if the Java object has no iterator() method.
    def each
      unless java_methods.include?('iterator()')
        raise 'This object cannot be iterated.'
      end
      return enum_for(:each) unless block_given?
      i = self.iterator
      yield i.next while i.has_next
    end
  end

end
@@ -0,0 +1,61 @@
1
module StanfordCoreNLP

  # Starts the JVM through Rjb (once) and adds JAR files to it.
  class JarLoader

    require 'rjb'

    # Configuration options.
    class << self
      # An array of string flags to supply to the JVM,
      # e.g. ['-Xms512M', '-Xmx1024M'].
      attr_accessor :jvm_flags
      # The path in which to look for JARs. It is prefixed to the JAR
      # name by plain string concatenation.
      attr_accessor :jar_path
      # The name of the file to log to, or nil for no redirection.
      # It is only consulted when the JVM is first started, so it must be
      # set before the first call to JarLoader.load.
      attr_accessor :log_file
    end

    self.jvm_flags = []
    self.jar_path = ''
    self.log_file = nil

    # Load Rjb and create Java VM.
    #
    # Does nothing when Rjb reports the JVM as already loaded. Otherwise
    # starts it with jvm_flags and, if log_file is set, redirects the
    # JVM's output to that file.
    def self.rjb_initialize
      return if ::Rjb::loaded?
      ::Rjb::load(nil, self.jvm_flags)
      redirect_to_log if self.log_file
    end

    # Redirect the output of the JVM to self.log_file.
    #
    # Both System.out and System.err are pointed at the same stream,
    # after a timestamp header has been written to it.
    def self.redirect_to_log
      const_set(:System, Rjb::import('java.lang.System'))
      const_set(:PrintStream, Rjb::import('java.io.PrintStream'))
      # Named File2 so it does not shadow Ruby's File inside this class.
      const_set(:File2, Rjb::import('java.io.File'))
      ps = PrintStream.new(File2.new(self.log_file))
      ps.write(::Time.now.strftime("[%m/%d/%Y at %I:%M%p]\n\n"))
      System.setOut(ps)
      System.setErr(ps)
    end

    # Load a jar.
    #
    # jar - String file name, relative to jar_path.
    #
    # Raises RuntimeError if the resulting path is not readable.
    def self.load(jar)
      self.rjb_initialize
      jar = self.jar_path + jar
      if !::File.readable?(jar)
        raise "Could not find JAR file (looking in #{jar})."
      end
      ::Rjb::add_jar(jar)
    end

    # Silence output and log to file.
    #
    # Stores the name through the log_file accessor, which is the value
    # rjb_initialize reads; it previously went into an unrelated class
    # variable and therefore had no effect. Must be called before the
    # JVM is started.
    def self.log(file = 'log.txt')
      self.log_file = file
    end

    # Whether the output is logged or not.
    # Returns the configured log file name (truthy) or nil.
    def self.log?; self.log_file; end

  end

end
metadata ADDED
@@ -0,0 +1,90 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: stanford-core-nlp
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Louis Mullie
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-01-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rjb
16
+ requirement: &70234870930100 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70234870930100
25
+ description: ! ' High-level Ruby bindings to the Stanford CoreNLP package, a set of natural
26
+ language processing tools for English, including tokenization, part-of-speech tagging,
27
+ lemmatization, named entity recognition, parsing, and coreference resolution. '
28
+ email:
29
+ - louis.mullie@gmail.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - lib/stanford-core-nlp/jar-loader.rb
35
+ - lib/stanford-core-nlp.rb
36
+ - bin/bridge.jar
37
+ - bin/classifiers/all.3class.distsim.crf.ser.gz
38
+ - bin/classifiers/all.3class.distsim.prop
39
+ - bin/classifiers/conll.4class.distsim.crf.ser.gz
40
+ - bin/classifiers/conll.4class.distsim.prop
41
+ - bin/classifiers/muc.7class.distsim.crf.ser.gz
42
+ - bin/classifiers/muc.7class.distsim.prop
43
+ - bin/dcoref/animate.unigrams.txt
44
+ - bin/dcoref/demonyms.txt
45
+ - bin/dcoref/female.unigrams.txt
46
+ - bin/dcoref/inanimate.unigrams.txt
47
+ - bin/dcoref/male.unigrams.txt
48
+ - bin/dcoref/namegender.combine.txt
49
+ - bin/dcoref/neutral.unigrams.txt
50
+ - bin/dcoref/plural.unigrams.txt
51
+ - bin/dcoref/singular.unigrams.txt
52
+ - bin/dcoref/state-abbreviations.txt
53
+ - bin/dcoref/unknown.txt
54
+ - bin/grammar/englishFactored.ser.gz
55
+ - bin/grammar/englishPCFG.ser.gz
56
+ - bin/joda-time.jar
57
+ - bin/stanford-corenlp.jar
58
+ - bin/taggers/english-bidirectional-distsim.tagger
59
+ - bin/taggers/english-bidirectional-distsim.tagger.props
60
+ - bin/taggers/english-left3words-distsim.tagger
61
+ - bin/taggers/english-left3words-distsim.tagger.props
62
+ - bin/taggers/README-Models.txt
63
+ - bin/xom.jar
64
+ - README
65
+ - LICENSE
66
+ homepage: https://github.com/louismullie/stanford-core-nlp
67
+ licenses: []
68
+ post_install_message:
69
+ rdoc_options: []
70
+ require_paths:
71
+ - lib
72
+ required_ruby_version: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ none: false
80
+ requirements:
81
+ - - ! '>='
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ requirements: []
85
+ rubyforge_project:
86
+ rubygems_version: 1.8.15
87
+ signing_key:
88
+ specification_version: 3
89
+ summary: Ruby bindings to the Stanford CoreNLP tools.
90
+ test_files: []