mitie 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 519a88b20911f72d0a66ceaca52e1af3171bab683e27392763f52ac4c494d185
4
- data.tar.gz: 6c955aa66776ef1ec92ccc151ba6fce1eeef8d968fad2ac8fd47cb0a7ea4f3bf
3
+ metadata.gz: ca692cd09ac5c4541998bf601b13e95a041ac29e5d3fffcdf1e00f4b1a5fee58
4
+ data.tar.gz: dec98a81c3e45b2e6f191883e4a331bbfc0a98d9da7630c9ee806addb2284704
5
5
  SHA512:
6
- metadata.gz: 1d8d373478c4ae69844959a349598c35f4187ff91e52e5a6be457b5e61769b0c109bb143b4604999665d6a1fce532ad027add26fc3ea03360764bd2025357c91
7
- data.tar.gz: b4c13d770bfb8b03108d6a93c757f286d7dc3e0f157ea5b2db94b7028994df632af40bdbd19f1fe7a40a96d4e3b5fa32d28f724df407c6a901f1bb79635cd046
6
+ metadata.gz: f802b2a582dc0362351f7c26441e482bc064affa03828e033a948c26a9b6a3bdb42ad216768510b507bd8be6dc0692a34a8b1a0125cfa9533e7710d5ee2c1cba
7
+ data.tar.gz: 0bf700445e710a16a871fff6d7089aa44a3a8a53420e7f9e5a7a8cf959ccf6066a5e7b59bd23a7631b2c378571e34f5cfa4d5158a6b599d91bc1caa3560b79c9
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.1.3 (2020-12-04)
2
+
3
+ - Added support for custom tokenization
4
+
1
5
  ## 0.1.2 (2020-09-14)
2
6
 
3
7
  - Added binary relation detection
data/README.md CHANGED
@@ -5,7 +5,7 @@
5
5
  - Finds people, organizations, and locations in text
6
6
  - Detects relationships between entities, like `PERSON` was born in `LOCATION`
7
7
 
8
- [![Build Status](https://travis-ci.org/ankane/mitie.svg?branch=master)](https://travis-ci.org/ankane/mitie) [![Build status](https://ci.appveyor.com/api/projects/status/stc89tc57xfva451/branch/master?svg=true)](https://ci.appveyor.com/project/ankane/mitie/branch/master)
8
+ [![Build Status](https://github.com/ankane/mitie/workflows/build/badge.svg?branch=master)](https://github.com/ankane/mitie/actions)
9
9
 
10
10
  ## Installation
11
11
 
@@ -32,7 +32,7 @@ model = Mitie::NER.new("ner_model.dat")
32
32
  Create a document
33
33
 
34
34
  ```ruby
35
- doc = model.doc("Nat Friedman is the CEO of GitHub, which is headquartered in San Francisco")
35
+ doc = model.doc("Nat works at GitHub in San Francisco")
36
36
  ```
37
37
 
38
38
  Get entities
@@ -45,9 +45,9 @@ This returns
45
45
 
46
46
  ```ruby
47
47
  [
48
- {text: "Nat Friedman", tag: "PERSON", score: 1.099661347535191, offset: 0},
49
- {text: "GitHub", tag: "ORGANIZATION", score: 0.344641651251650, offset: 27},
50
- {text: "San Francisco", tag: "LOCATION", score: 1.428241888939011, offset: 61}
48
+ {text: "Nat", tag: "PERSON", score: 0.3112371212688382, offset: 0},
49
+ {text: "GitHub", tag: "ORGANIZATION", score: 0.5660115198329334, offset: 13},
50
+ {text: "San Francisco", tag: "LOCATION", score: 1.3890524313885309, offset: 23}
51
51
  ]
52
52
  ```
53
53
 
@@ -82,13 +82,13 @@ There are 21 detectors for English. You can find them in the `binary_relations`
82
82
  Load a detector
83
83
 
84
84
  ```ruby
85
- detector = Mitie::BinaryRelationDetector.new("rel_classifier_film.film.directed_by.svm")
85
+ detector = Mitie::BinaryRelationDetector.new("rel_classifier_organization.organization.place_founded.svm")
86
86
  ```
87
87
 
88
88
  And create a document
89
89
 
90
90
  ```ruby
91
- doc = model.doc("The Shawshank Redemption was directed by Frank Darabont")
91
+ doc = model.doc("Shopify was founded in Ottawa")
92
92
  ```
93
93
 
94
94
  Get relations
@@ -100,7 +100,7 @@ detector.relations(doc)
100
100
  This returns
101
101
 
102
102
  ```ruby
103
- [{first: "Shawshank Redemption", second: "Frank Darabont", score: 1.124211742912441}]
103
+ [{first: "Shopify", second: "Ottawa", score: 0.17649169745814464}]
104
104
  ```
105
105
 
106
106
  ## History
data/lib/mitie/document.rb CHANGED
@@ -4,7 +4,7 @@ module Mitie
4
4
 
5
5
  def initialize(model, text)
6
6
  @model = model
7
- @text = text.to_s
7
+ @text = text
8
8
  end
9
9
 
10
10
  def tokens
@@ -13,16 +13,21 @@ module Mitie
13
13
 
14
14
  def tokens_with_offset
15
15
  @tokens_with_offset ||= begin
16
- i = 0
17
- tokens = []
18
- loop do
19
- token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr
20
- break if token.null?
21
- offset = (offsets_ptr.ptr + i * Fiddle::SIZEOF_LONG).to_s(Fiddle::SIZEOF_LONG).unpack1("L!")
22
- tokens << [token.to_s.force_encoding(text.encoding), offset]
23
- i += 1
16
+ if text.is_a?(Array)
17
+ # offsets are unknown when given tokens
18
+ text.map { |v| [v, nil] }
19
+ else
20
+ i = 0
21
+ tokens = []
22
+ loop do
23
+ token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr
24
+ break if token.null?
25
+ offset = (offsets_ptr.ptr + i * Fiddle::SIZEOF_LONG).to_s(Fiddle::SIZEOF_LONG).unpack1("L!")
26
+ tokens << [token.to_s.force_encoding(text.encoding), offset]
27
+ i += 1
28
+ end
29
+ tokens
24
30
  end
25
- tokens
26
31
  end
27
32
  end
28
33
 
@@ -40,15 +45,20 @@ module Mitie
40
45
  score = FFI.mitie_ner_get_detection_score(detections, i)
41
46
  tok = tokens[pos, len]
42
47
  offset = tok[0][1]
43
- finish = tok[-1][1] + tok[-1][0].size
44
- entities << {
45
- text: text[offset...finish],
46
- tag: tag,
47
- score: score,
48
- offset: offset,
49
- token_index: pos,
50
- token_length: len
51
- }
48
+
49
+ entity = {}
50
+ if offset
51
+ finish = tok[-1][1] + tok[-1][0].size
52
+ entity[:text] = text[offset...finish]
53
+ else
54
+ entity[:text] = tok.map(&:first)
55
+ end
56
+ entity[:tag] = tag
57
+ entity[:score] = score
58
+ entity[:offset] = offset if offset
59
+ entity[:token_index] = pos
60
+ entity[:token_length] = len
61
+ entities << entity
52
62
  end
53
63
  entities
54
64
  ensure
@@ -73,13 +83,22 @@ module Mitie
73
83
 
74
84
  def tokenize
75
85
  @tokenize ||= begin
76
- offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
77
- tokens_ptr = FFI.mitie_tokenize_with_offsets(text, offsets_ptr)
86
+ if text.is_a?(Array)
87
+ # malloc uses memset to set all bytes to 0
88
+ tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1))
89
+ text.size.times do |i|
90
+ tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = Fiddle::Pointer.to_ptr(text[i]).ref
91
+ end
92
+ [tokens_ptr, nil]
93
+ else
94
+ offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
95
+ tokens_ptr = FFI.mitie_tokenize_with_offsets(text, offsets_ptr)
78
96
 
79
- ObjectSpace.define_finalizer(tokens_ptr, self.class.finalize(tokens_ptr))
80
- ObjectSpace.define_finalizer(offsets_ptr, self.class.finalize_ptr(offsets_ptr))
97
+ ObjectSpace.define_finalizer(tokens_ptr, self.class.finalize(tokens_ptr))
98
+ ObjectSpace.define_finalizer(offsets_ptr, self.class.finalize_ptr(offsets_ptr))
81
99
 
82
- [tokens_ptr, offsets_ptr]
100
+ [tokens_ptr, offsets_ptr]
101
+ end
83
102
  end
84
103
  end
85
104
 
data/lib/mitie/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Mitie
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mitie
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-15 00:00:00.000000000 Z
11
+ date: 2020-12-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,7 +52,7 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '5'
55
- description:
55
+ description:
56
56
  email: andrew@chartkick.com
57
57
  executables: []
58
58
  extensions: []
@@ -75,7 +75,7 @@ homepage: https://github.com/ankane/mitie
75
75
  licenses:
76
76
  - BSL-1.0
77
77
  metadata: {}
78
- post_install_message:
78
+ post_install_message:
79
79
  rdoc_options: []
80
80
  require_paths:
81
81
  - lib
@@ -90,8 +90,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
90
90
  - !ruby/object:Gem::Version
91
91
  version: '0'
92
92
  requirements: []
93
- rubygems_version: 3.1.2
94
- signing_key:
93
+ rubygems_version: 3.1.4
94
+ signing_key:
95
95
  specification_version: 4
96
96
  summary: Named-entity recognition for Ruby
97
97
  test_files: []