opener-ner-base 3.0.1 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8981a1d8d5e60e579ee5fd0e0e8ce4f6b0dbf6c9
4
- data.tar.gz: 56faadbc7debd07516a5f72c0403c2496b5a7b42
3
+ metadata.gz: 184b5a02199988f3fa5e798e9438b679966542e0
4
+ data.tar.gz: e60c01830e0f3e97b1a7489cb4c5acfdc9d3cd14
5
5
  SHA512:
6
- metadata.gz: f943839a5f64c77d71555ca096707481d93242473eb7663cc96a3904991b653a062eb1a7f62cda783e89b2f2824c32d839f95d1a61949f45b4cf9d021466ebf3
7
- data.tar.gz: b2799464a0a2da1af3af91aa8282f5a53925e4dab639becb0deeb6cfb122b5c985d19168da8f08b8622989dcf6c55990eccdb98644934d0fa9213b39eca01c77
6
+ metadata.gz: ca7ed76f8a08c1409ed6de52a2178e766994816ebf92e89821700b0f7cddf0b1e6c0f59e0c69009fbd9877149c84d00dc60efa7ab24be22b403ed4bbb3bbdaec
7
+ data.tar.gz: 7852dd522d262eb94f9482f4a0a81422ba66bd9b762d52a5bf54b79e3192161cd0039b53a2f70f075ece4a93bfb2d59dfd6c763235be188858dbe95dd9141f47
data/README.md CHANGED
@@ -1,5 +1,3 @@
1
- [![Build Status](https://drone.io/github.com/opener-project/ner-base/status.png)](https://drone.io/github.com/opener-project/ner-base/latest)
2
-
3
1
  # NER Base
4
2
 
5
3
  This repository contains the source code used for performing Named Entity
@@ -42,7 +40,14 @@ Using specific install:
42
40
 
43
41
  ## Usage
44
42
 
45
- cat some_input_file.kaf | ner-en
43
+ Basic usage:
44
+
45
+ cat some_input_file.kaf | ner-base
46
+
47
+ This component ships a built-in set of models. If you have your own models you
48
+ can set the environment variable `NER_BASE_MODELS_PATH` to the directory
49
+ containing your models. Each model should be named `LANGUAGE.bin` where
50
+ `LANGUAGE` is a 2 letter language code (`nl`, `en`, etc.).
46
51
 
47
52
  ## Contributing
48
53
 
@@ -1,74 +1,36 @@
1
- require 'open3'
2
1
  require 'stringio'
3
- require 'nokogiri'
4
-
5
- require File.expand_path("../../../../core/target/ixa-pipe-nerc-1.1.0.jar", __FILE__)
2
+ require 'oga'
3
+ require 'opener/core'
6
4
 
7
5
  require_relative 'base/version'
8
6
 
7
+ require File.expand_path("../../../../core/target/ixa-pipe-nerc-1.5.2.jar", __FILE__)
8
+
9
9
  module Opener
10
10
  module Ners
11
11
  ##
12
12
  # Base NER class that supports various languages such as Dutch and English.
13
13
  #
14
- # @!attribute [r] options
15
- # @return [Hash]
16
- #
17
- # @!attribute [r] features
18
- # @return [String]
19
- #
20
- # @!attribute [r] beamsize
21
- # @return [Fixnum]
22
- #
23
- # @!attribute [r] dictionaries
24
- # @return [String]
25
- #
26
- # @!attribute [r] dictionaries_path
27
- # @return [String]
28
- #
29
- # @!attribute [r] lexer
30
- # @return [Fixnum]
31
- #
32
- # @!attribute [r] model
33
- # @return [String]
34
- #
35
- # @!attribute [r] enable_time
36
- # @return [TrueClass|FalseClass]
37
- #
38
14
  class Base
39
- attr_reader :features, :beamsize, :dictionaries, :dictionaries_path,
40
- :lexer, :model, :enable_time
15
+ # The default models directory.
16
+ MODELS_PATH = File.expand_path('../../../../models', __FILE__)
17
+
18
+ # @return [String]
19
+ attr_reader :models
20
+
21
+ # @return [TrueClass|FalseClass]
22
+ attr_reader :enable_time
41
23
 
42
24
  ##
43
25
  # @param [Hash] options
44
26
  #
45
- # @option options [String] :features The NERC feature to use, defaults to
46
- # "baseline".
47
- #
48
- # @option options [Fixnum] :beamsize The beam size for decoding, defaults
49
- # to 3.
50
- #
51
- # @option options [String] :dictionaries The dictionary to use, if any.
52
- #
53
- # @option options [String] :dictionaries_path The path to the
54
- # dictionaries.
55
- #
56
- # @option options [Fixnum] :lexer The lexer rules to use for NERC
57
- # tagging.
58
- #
59
- # @option options [String] :model The model to use for NERC annotation.
60
- #
61
27
  # @option options [TrueClass|FalseClass] :enable_time Whether or not to
62
28
  # enable dynamic timestamps (enabled by default).
63
29
  #
64
30
  def initialize(options = {})
65
- @dictionaries = options[:dictionaries]
66
- @dictionaries_path = options[:dictionaries_path]
67
- @features = options.fetch(:features, 'baseline')
68
- @beamsize = options.fetch(:beamsize, 3)
69
- @lexer = options[:lexer]
70
- @model = options.fetch(:model, 'default')
71
- @enable_time = options.fetch(:enable_time, true)
31
+ @models = ENV['NER_BASE_MODELS_PATH'] || MODELS_PATH
32
+
33
+ @enable_time = options.fetch(:enable_time, true)
72
34
  end
73
35
 
74
36
  ##
@@ -79,19 +41,16 @@ module Opener
79
41
  # @return [Array]
80
42
  #
81
43
  def run(input)
82
- lang = language_from_kaf(input)
83
- kaf = new_kaf_document(input)
84
- args = [lang, model, features, beamsize]
44
+ lang = language_from_kaf(input)
45
+ model = File.join(models, "#{lang}.bin")
85
46
 
86
- if use_dictionaries?
87
- args += [dictionaries, dictionaries_path, lexer]
88
- end
47
+ raise(Core::UnsupportedLanguageError, lang) unless File.file?(model)
89
48
 
90
- annotator = Java::es.ehu.si.ixa.pipe.nerc.Annotate.new(*args)
49
+ kaf = new_kaf_document(input)
50
+ properties = build_properties(lang, model)
51
+ annotator = Java::eus.ixa.ixa.pipe.nerc.Annotate.new(properties)
91
52
 
92
53
  annotator.annotate_kaf(enable_time, kaf)
93
-
94
- return kaf.to_string
95
54
  end
96
55
 
97
56
  ##
@@ -102,14 +61,7 @@ module Opener
102
61
  input_io = StringIO.new(input)
103
62
  reader = Java::java.io.InputStreamReader.new(input_io.to_inputstream)
104
63
 
105
- return Java::ixa.kaflib.KAFDocument.create_from_stream(reader)
106
- end
107
-
108
- ##
109
- # @return [TrueClass|FalseClass]
110
- #
111
- def use_dictionaries?
112
- return dictionaries || dictionaries_path || features == 'dict'
64
+ Java::ixa.kaflib.KAFDocument.create_from_stream(reader)
113
65
  end
114
66
 
115
67
  ##
@@ -119,9 +71,39 @@ module Opener
119
71
  # @return [String]
120
72
  #
121
73
  def language_from_kaf(input)
122
- document = Nokogiri::XML(input)
74
+ parser = Oga::XML::PullParser.new(input)
75
+ language = nil
76
+
77
+ parser.parse do |node|
78
+ if node.is_a?(Oga::XML::Element) and node.name == 'KAF'
79
+ language = node.get('xml:lang')
80
+ break
81
+ end
82
+ end
83
+
84
+ # Make sure nobody can _somehow_ inject a language such as "../../foo".
85
+ unless language =~ /\A[a-zA-Z\-_]+\z/
86
+ raise Core::UnsupportedLanguageError, language
87
+ end
88
+
89
+ language
90
+ end
91
+
92
+ private
93
+
94
+ # @param [String] language
95
+ # @param [String] model
96
+ def build_properties(language, model)
97
+ properties = Java::java.util.Properties.new
98
+
99
+ properties.set_property('language', language)
100
+ properties.set_property('model', model)
101
+ properties.set_property('ruleBasedOption', 'off')
102
+ properties.set_property('dictTag', 'off')
103
+ properties.set_property('dictPath', 'off')
104
+ properties.set_property('clearFeatures', 'no')
123
105
 
124
- return document.at('KAF').attr('xml:lang')
106
+ properties
125
107
  end
126
108
  end # Base
127
109
  end # Ners
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  module Ners
3
3
  class Base
4
- VERSION = '3.0.1'
4
+ VERSION = '3.1.0'
5
5
  end # Base
6
6
  end # Ners
7
7
  end # Opener
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -12,6 +12,7 @@ Gem::Specification.new do |gem|
12
12
  gem.files = Dir.glob([
13
13
  'core/target/ixa-pipe-nerc-*.jar',
14
14
  'lib/**/*',
15
+ 'models/**/*',
15
16
  '*.gemspec',
16
17
  'README.md',
17
18
  'LICENSE.txt'
@@ -19,7 +20,8 @@ Gem::Specification.new do |gem|
19
20
 
20
21
  gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
21
22
 
22
- gem.add_dependency 'nokogiri'
23
+ gem.add_dependency 'oga'
24
+ gem.add_dependency 'opener-core'
23
25
 
24
26
  gem.add_development_dependency 'rspec', '~> 3.0'
25
27
  gem.add_development_dependency 'cucumber'
metadata CHANGED
@@ -1,85 +1,99 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-ner-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.1
4
+ version: 3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-28 00:00:00.000000000 Z
11
+ date: 2015-09-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nokogiri
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ name: oga
20
+ prerelease: false
21
+ type: :runtime
15
22
  version_requirements: !ruby/object:Gem::Requirement
16
23
  requirements:
17
- - - '>='
24
+ - - ">="
18
25
  - !ruby/object:Gem::Version
19
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
20
28
  requirement: !ruby/object:Gem::Requirement
21
29
  requirements:
22
- - - '>='
30
+ - - ">="
23
31
  - !ruby/object:Gem::Version
24
32
  version: '0'
33
+ name: opener-core
25
34
  prerelease: false
26
35
  type: :runtime
27
- - !ruby/object:Gem::Dependency
28
- name: rspec
29
36
  version_requirements: !ruby/object:Gem::Requirement
30
37
  requirements:
31
- - - ~>
38
+ - - ">="
32
39
  - !ruby/object:Gem::Version
33
- version: '3.0'
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
34
42
  requirement: !ruby/object:Gem::Requirement
35
43
  requirements:
36
- - - ~>
44
+ - - "~>"
37
45
  - !ruby/object:Gem::Version
38
46
  version: '3.0'
47
+ name: rspec
39
48
  prerelease: false
40
49
  type: :development
41
- - !ruby/object:Gem::Dependency
42
- name: cucumber
43
50
  version_requirements: !ruby/object:Gem::Requirement
44
51
  requirements:
45
- - - '>='
52
+ - - "~>"
46
53
  - !ruby/object:Gem::Version
47
- version: '0'
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
48
56
  requirement: !ruby/object:Gem::Requirement
49
57
  requirements:
50
- - - '>='
58
+ - - ">="
51
59
  - !ruby/object:Gem::Version
52
60
  version: '0'
61
+ name: cucumber
53
62
  prerelease: false
54
63
  type: :development
55
- - !ruby/object:Gem::Dependency
56
- name: rake
57
64
  version_requirements: !ruby/object:Gem::Requirement
58
65
  requirements:
59
- - - '>='
66
+ - - ">="
60
67
  - !ruby/object:Gem::Version
61
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
62
70
  requirement: !ruby/object:Gem::Requirement
63
71
  requirements:
64
- - - '>='
72
+ - - ">="
65
73
  - !ruby/object:Gem::Version
66
74
  version: '0'
75
+ name: rake
67
76
  prerelease: false
68
77
  type: :development
69
- - !ruby/object:Gem::Dependency
70
- name: cliver
71
78
  version_requirements: !ruby/object:Gem::Requirement
72
79
  requirements:
73
- - - '>='
80
+ - - ">="
74
81
  - !ruby/object:Gem::Version
75
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
76
84
  requirement: !ruby/object:Gem::Requirement
77
85
  requirements:
78
- - - '>='
86
+ - - ">="
79
87
  - !ruby/object:Gem::Version
80
88
  version: '0'
89
+ name: cliver
81
90
  prerelease: false
82
91
  type: :development
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
83
97
  description: Base NER component for languages such as English.
84
98
  email:
85
99
  executables:
@@ -90,9 +104,15 @@ files:
90
104
  - LICENSE.txt
91
105
  - README.md
92
106
  - bin/ner-base
93
- - core/target/ixa-pipe-nerc-1.1.0.jar
107
+ - core/target/ixa-pipe-nerc-1.5.2.jar
94
108
  - lib/opener/ners/base.rb
95
109
  - lib/opener/ners/base/version.rb
110
+ - models/de.bin
111
+ - models/en.bin
112
+ - models/es.bin
113
+ - models/fr.bin
114
+ - models/it.bin
115
+ - models/nl.bin
96
116
  - opener-ner-base.gemspec
97
117
  homepage: http://opener-project.github.com/
98
118
  licenses:
@@ -104,18 +124,19 @@ require_paths:
104
124
  - lib
105
125
  required_ruby_version: !ruby/object:Gem::Requirement
106
126
  requirements:
107
- - - '>='
127
+ - - ">="
108
128
  - !ruby/object:Gem::Version
109
129
  version: '0'
110
130
  required_rubygems_version: !ruby/object:Gem::Requirement
111
131
  requirements:
112
- - - '>='
132
+ - - ">="
113
133
  - !ruby/object:Gem::Version
114
134
  version: '0'
115
135
  requirements: []
116
136
  rubyforge_project:
117
- rubygems_version: 2.2.2
137
+ rubygems_version: 2.4.8
118
138
  signing_key:
119
139
  specification_version: 4
120
140
  summary: Base NER component for languages such as English.
121
141
  test_files: []
142
+ has_rdoc: