opener-ner-base 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8981a1d8d5e60e579ee5fd0e0e8ce4f6b0dbf6c9
4
- data.tar.gz: 56faadbc7debd07516a5f72c0403c2496b5a7b42
3
+ metadata.gz: 184b5a02199988f3fa5e798e9438b679966542e0
4
+ data.tar.gz: e60c01830e0f3e97b1a7489cb4c5acfdc9d3cd14
5
5
  SHA512:
6
- metadata.gz: f943839a5f64c77d71555ca096707481d93242473eb7663cc96a3904991b653a062eb1a7f62cda783e89b2f2824c32d839f95d1a61949f45b4cf9d021466ebf3
7
- data.tar.gz: b2799464a0a2da1af3af91aa8282f5a53925e4dab639becb0deeb6cfb122b5c985d19168da8f08b8622989dcf6c55990eccdb98644934d0fa9213b39eca01c77
6
+ metadata.gz: ca7ed76f8a08c1409ed6de52a2178e766994816ebf92e89821700b0f7cddf0b1e6c0f59e0c69009fbd9877149c84d00dc60efa7ab24be22b403ed4bbb3bbdaec
7
+ data.tar.gz: 7852dd522d262eb94f9482f4a0a81422ba66bd9b762d52a5bf54b79e3192161cd0039b53a2f70f075ece4a93bfb2d59dfd6c763235be188858dbe95dd9141f47
data/README.md CHANGED
@@ -1,5 +1,3 @@
1
- [![Build Status](https://drone.io/github.com/opener-project/ner-base/status.png)](https://drone.io/github.com/opener-project/ner-base/latest)
2
-
3
1
  # NER Base
4
2
 
5
3
  This repository contains the source code used for performing Named Entity
@@ -42,7 +40,14 @@ Using specific install:
42
40
 
43
41
  ## Usage
44
42
 
45
- cat some_input_file.kaf | ner-en
43
+ Basic usage:
44
+
45
+ cat some_input_file.kaf | ner-base
46
+
47
+ This component ships a built-in set of models. If you have your own models you
48
+ can set the environment variable `NER_BASE_MODELS_PATH` to the directory
49
+ containing your models. Each model should be named `LANGUAGE.bin` where
50
+ `LANGUAGE` is a 2 letter language code (`nl`, `en`, etc).
46
51
 
47
52
  ## Contributing
48
53
 
@@ -1,74 +1,36 @@
1
- require 'open3'
2
1
  require 'stringio'
3
- require 'nokogiri'
4
-
5
- require File.expand_path("../../../../core/target/ixa-pipe-nerc-1.1.0.jar", __FILE__)
2
+ require 'oga'
3
+ require 'opener/core'
6
4
 
7
5
  require_relative 'base/version'
8
6
 
7
+ require File.expand_path("../../../../core/target/ixa-pipe-nerc-1.5.2.jar", __FILE__)
8
+
9
9
  module Opener
10
10
  module Ners
11
11
  ##
12
12
  # Base NER class that supports various languages such as Dutch and English.
13
13
  #
14
- # @!attribute [r] options
15
- # @return [Hash]
16
- #
17
- # @!attribute [r] features
18
- # @return [String]
19
- #
20
- # @!attribute [r] beamsize
21
- # @return [Fixnum]
22
- #
23
- # @!attribute [r] dictionaries
24
- # @return [String]
25
- #
26
- # @!attribute [r] dictionaries_path
27
- # @return [String]
28
- #
29
- # @!attribute [r] lexer
30
- # @return [Fixnum]
31
- #
32
- # @!attribute [r] model
33
- # @return [String]
34
- #
35
- # @!attribute [r] enable_time
36
- # @return [TrueClass|FalseClass]
37
- #
38
14
  class Base
39
- attr_reader :features, :beamsize, :dictionaries, :dictionaries_path,
40
- :lexer, :model, :enable_time
15
+ # The default models directory.
16
+ MODELS_PATH = File.expand_path('../../../../models', __FILE__)
17
+
18
+ # @return [String]
19
+ attr_reader :models
20
+
21
+ # @return [TrueClass|FalseClass]
22
+ attr_reader :enable_time
41
23
 
42
24
  ##
43
25
  # @param [Hash] options
44
26
  #
45
- # @option options [String] :features The NERC feature to use, defaults to
46
- # "baseline".
47
- #
48
- # @option options [Fixnum] :beamsize The beam size for decoding, defaults
49
- # to 3.
50
- #
51
- # @option options [String] :dictionaries The dictionary to use, if any.
52
- #
53
- # @option options [String] :dictionaries_path The path to the
54
- # dictionaries.
55
- #
56
- # @option options [Fixnum] :lexer The lexer rules to use for NERC
57
- # tagging.
58
- #
59
- # @option options [String] :model The model to use for NERC annotation.
60
- #
61
27
  # @option options [TrueClass|FalseClass] :enable_time Whether or not to
62
28
  # enable dynamic timestamps (enabled by default).
63
29
  #
64
30
  def initialize(options = {})
65
- @dictionaries = options[:dictionaries]
66
- @dictionaries_path = options[:dictionaries_path]
67
- @features = options.fetch(:features, 'baseline')
68
- @beamsize = options.fetch(:beamsize, 3)
69
- @lexer = options[:lexer]
70
- @model = options.fetch(:model, 'default')
71
- @enable_time = options.fetch(:enable_time, true)
31
+ @models = ENV['NER_BASE_MODELS_PATH'] || MODELS_PATH
32
+
33
+ @enable_time = options.fetch(:enable_time, true)
72
34
  end
73
35
 
74
36
  ##
@@ -79,19 +41,16 @@ module Opener
79
41
  # @return [Array]
80
42
  #
81
43
  def run(input)
82
- lang = language_from_kaf(input)
83
- kaf = new_kaf_document(input)
84
- args = [lang, model, features, beamsize]
44
+ lang = language_from_kaf(input)
45
+ model = File.join(models, "#{lang}.bin")
85
46
 
86
- if use_dictionaries?
87
- args += [dictionaries, dictionaries_path, lexer]
88
- end
47
+ raise(Core::UnsupportedLanguageError, lang) unless File.file?(model)
89
48
 
90
- annotator = Java::es.ehu.si.ixa.pipe.nerc.Annotate.new(*args)
49
+ kaf = new_kaf_document(input)
50
+ properties = build_properties(lang, model)
51
+ annotator = Java::eus.ixa.ixa.pipe.nerc.Annotate.new(properties)
91
52
 
92
53
  annotator.annotate_kaf(enable_time, kaf)
93
-
94
- return kaf.to_string
95
54
  end
96
55
 
97
56
  ##
@@ -102,14 +61,7 @@ module Opener
102
61
  input_io = StringIO.new(input)
103
62
  reader = Java::java.io.InputStreamReader.new(input_io.to_inputstream)
104
63
 
105
- return Java::ixa.kaflib.KAFDocument.create_from_stream(reader)
106
- end
107
-
108
- ##
109
- # @return [TrueClass|FalseClass]
110
- #
111
- def use_dictionaries?
112
- return dictionaries || dictionaries_path || features == 'dict'
64
+ Java::ixa.kaflib.KAFDocument.create_from_stream(reader)
113
65
  end
114
66
 
115
67
  ##
@@ -119,9 +71,39 @@ module Opener
119
71
  # @return [String]
120
72
  #
121
73
  def language_from_kaf(input)
122
- document = Nokogiri::XML(input)
74
+ parser = Oga::XML::PullParser.new(input)
75
+ language = nil
76
+
77
+ parser.parse do |node|
78
+ if node.is_a?(Oga::XML::Element) and node.name == 'KAF'
79
+ language = node.get('xml:lang')
80
+ break
81
+ end
82
+ end
83
+
84
+ # Make sure nobody can _somehow_ inject a language such as "../../foo".
85
+ unless language =~ /\A[a-zA-Z\-_]+\z/
86
+ raise Core::UnsupportedLanguageError, language
87
+ end
88
+
89
+ language
90
+ end
91
+
92
+ private
93
+
94
+ # @param [String] language
95
+ # @param [String] model
96
+ def build_properties(language, model)
97
+ properties = Java::java.util.Properties.new
98
+
99
+ properties.set_property('language', language)
100
+ properties.set_property('model', model)
101
+ properties.set_property('ruleBasedOption', 'off')
102
+ properties.set_property('dictTag', 'off')
103
+ properties.set_property('dictPath', 'off')
104
+ properties.set_property('clearFeatures', 'no')
123
105
 
124
- return document.at('KAF').attr('xml:lang')
106
+ properties
125
107
  end
126
108
  end # Base
127
109
  end # Ners
@@ -1,7 +1,7 @@
1
1
  module Opener
2
2
  module Ners
3
3
  class Base
4
- VERSION = '3.0.1'
4
+ VERSION = '3.1.0'
5
5
  end # Base
6
6
  end # Ners
7
7
  end # Opener
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -12,6 +12,7 @@ Gem::Specification.new do |gem|
12
12
  gem.files = Dir.glob([
13
13
  'core/target/ixa-pipe-nerc-*.jar',
14
14
  'lib/**/*',
15
+ 'models/**/*',
15
16
  '*.gemspec',
16
17
  'README.md',
17
18
  'LICENSE.txt'
@@ -19,7 +20,8 @@ Gem::Specification.new do |gem|
19
20
 
20
21
  gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
21
22
 
22
- gem.add_dependency 'nokogiri'
23
+ gem.add_dependency 'oga'
24
+ gem.add_dependency 'opener-core'
23
25
 
24
26
  gem.add_development_dependency 'rspec', '~> 3.0'
25
27
  gem.add_development_dependency 'cucumber'
metadata CHANGED
@@ -1,85 +1,99 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-ner-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.1
4
+ version: 3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-28 00:00:00.000000000 Z
11
+ date: 2015-09-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nokogiri
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ name: oga
20
+ prerelease: false
21
+ type: :runtime
15
22
  version_requirements: !ruby/object:Gem::Requirement
16
23
  requirements:
17
- - - '>='
24
+ - - ">="
18
25
  - !ruby/object:Gem::Version
19
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
20
28
  requirement: !ruby/object:Gem::Requirement
21
29
  requirements:
22
- - - '>='
30
+ - - ">="
23
31
  - !ruby/object:Gem::Version
24
32
  version: '0'
33
+ name: opener-core
25
34
  prerelease: false
26
35
  type: :runtime
27
- - !ruby/object:Gem::Dependency
28
- name: rspec
29
36
  version_requirements: !ruby/object:Gem::Requirement
30
37
  requirements:
31
- - - ~>
38
+ - - ">="
32
39
  - !ruby/object:Gem::Version
33
- version: '3.0'
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
34
42
  requirement: !ruby/object:Gem::Requirement
35
43
  requirements:
36
- - - ~>
44
+ - - "~>"
37
45
  - !ruby/object:Gem::Version
38
46
  version: '3.0'
47
+ name: rspec
39
48
  prerelease: false
40
49
  type: :development
41
- - !ruby/object:Gem::Dependency
42
- name: cucumber
43
50
  version_requirements: !ruby/object:Gem::Requirement
44
51
  requirements:
45
- - - '>='
52
+ - - "~>"
46
53
  - !ruby/object:Gem::Version
47
- version: '0'
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
48
56
  requirement: !ruby/object:Gem::Requirement
49
57
  requirements:
50
- - - '>='
58
+ - - ">="
51
59
  - !ruby/object:Gem::Version
52
60
  version: '0'
61
+ name: cucumber
53
62
  prerelease: false
54
63
  type: :development
55
- - !ruby/object:Gem::Dependency
56
- name: rake
57
64
  version_requirements: !ruby/object:Gem::Requirement
58
65
  requirements:
59
- - - '>='
66
+ - - ">="
60
67
  - !ruby/object:Gem::Version
61
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
62
70
  requirement: !ruby/object:Gem::Requirement
63
71
  requirements:
64
- - - '>='
72
+ - - ">="
65
73
  - !ruby/object:Gem::Version
66
74
  version: '0'
75
+ name: rake
67
76
  prerelease: false
68
77
  type: :development
69
- - !ruby/object:Gem::Dependency
70
- name: cliver
71
78
  version_requirements: !ruby/object:Gem::Requirement
72
79
  requirements:
73
- - - '>='
80
+ - - ">="
74
81
  - !ruby/object:Gem::Version
75
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
76
84
  requirement: !ruby/object:Gem::Requirement
77
85
  requirements:
78
- - - '>='
86
+ - - ">="
79
87
  - !ruby/object:Gem::Version
80
88
  version: '0'
89
+ name: cliver
81
90
  prerelease: false
82
91
  type: :development
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
83
97
  description: Base NER component for languages such as English.
84
98
  email:
85
99
  executables:
@@ -90,9 +104,15 @@ files:
90
104
  - LICENSE.txt
91
105
  - README.md
92
106
  - bin/ner-base
93
- - core/target/ixa-pipe-nerc-1.1.0.jar
107
+ - core/target/ixa-pipe-nerc-1.5.2.jar
94
108
  - lib/opener/ners/base.rb
95
109
  - lib/opener/ners/base/version.rb
110
+ - models/de.bin
111
+ - models/en.bin
112
+ - models/es.bin
113
+ - models/fr.bin
114
+ - models/it.bin
115
+ - models/nl.bin
96
116
  - opener-ner-base.gemspec
97
117
  homepage: http://opener-project.github.com/
98
118
  licenses:
@@ -104,18 +124,19 @@ require_paths:
104
124
  - lib
105
125
  required_ruby_version: !ruby/object:Gem::Requirement
106
126
  requirements:
107
- - - '>='
127
+ - - ">="
108
128
  - !ruby/object:Gem::Version
109
129
  version: '0'
110
130
  required_rubygems_version: !ruby/object:Gem::Requirement
111
131
  requirements:
112
- - - '>='
132
+ - - ">="
113
133
  - !ruby/object:Gem::Version
114
134
  version: '0'
115
135
  requirements: []
116
136
  rubyforge_project:
117
- rubygems_version: 2.2.2
137
+ rubygems_version: 2.4.8
118
138
  signing_key:
119
139
  specification_version: 4
120
140
  summary: Base NER component for languages such as English.
121
141
  test_files: []
142
+ has_rdoc: