mitie 0.1.2 → 0.1.3

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 519a88b20911f72d0a66ceaca52e1af3171bab683e27392763f52ac4c494d185
4
- data.tar.gz: 6c955aa66776ef1ec92ccc151ba6fce1eeef8d968fad2ac8fd47cb0a7ea4f3bf
3
+ metadata.gz: ca692cd09ac5c4541998bf601b13e95a041ac29e5d3fffcdf1e00f4b1a5fee58
4
+ data.tar.gz: dec98a81c3e45b2e6f191883e4a331bbfc0a98d9da7630c9ee806addb2284704
5
5
  SHA512:
6
- metadata.gz: 1d8d373478c4ae69844959a349598c35f4187ff91e52e5a6be457b5e61769b0c109bb143b4604999665d6a1fce532ad027add26fc3ea03360764bd2025357c91
7
- data.tar.gz: b4c13d770bfb8b03108d6a93c757f286d7dc3e0f157ea5b2db94b7028994df632af40bdbd19f1fe7a40a96d4e3b5fa32d28f724df407c6a901f1bb79635cd046
6
+ metadata.gz: f802b2a582dc0362351f7c26441e482bc064affa03828e033a948c26a9b6a3bdb42ad216768510b507bd8be6dc0692a34a8b1a0125cfa9533e7710d5ee2c1cba
7
+ data.tar.gz: 0bf700445e710a16a871fff6d7089aa44a3a8a53420e7f9e5a7a8cf959ccf6066a5e7b59bd23a7631b2c378571e34f5cfa4d5158a6b599d91bc1caa3560b79c9
@@ -1,3 +1,7 @@
1
+ ## 0.1.3 (2020-12-04)
2
+
3
+ - Added support for custom tokenization
4
+
1
5
  ## 0.1.2 (2020-09-14)
2
6
 
3
7
  - Added binary relation detection
data/README.md CHANGED
@@ -5,7 +5,7 @@
5
5
  - Finds people, organizations, and locations in text
6
6
  - Detects relationships between entities, like `PERSON` was born in `LOCATION`
7
7
 
8
- [![Build Status](https://travis-ci.org/ankane/mitie.svg?branch=master)](https://travis-ci.org/ankane/mitie) [![Build status](https://ci.appveyor.com/api/projects/status/stc89tc57xfva451/branch/master?svg=true)](https://ci.appveyor.com/project/ankane/mitie/branch/master)
8
+ [![Build Status](https://github.com/ankane/mitie/workflows/build/badge.svg?branch=master)](https://github.com/ankane/mitie/actions)
9
9
 
10
10
  ## Installation
11
11
 
@@ -32,7 +32,7 @@ model = Mitie::NER.new("ner_model.dat")
32
32
  Create a document
33
33
 
34
34
  ```ruby
35
- doc = model.doc("Nat Friedman is the CEO of GitHub, which is headquartered in San Francisco")
35
+ doc = model.doc("Nat works at GitHub in San Francisco")
36
36
  ```
37
37
 
38
38
  Get entities
@@ -45,9 +45,9 @@ This returns
45
45
 
46
46
  ```ruby
47
47
  [
48
- {text: "Nat Friedman", tag: "PERSON", score: 1.099661347535191, offset: 0},
49
- {text: "GitHub", tag: "ORGANIZATION", score: 0.344641651251650, offset: 27},
50
- {text: "San Francisco", tag: "LOCATION", score: 1.428241888939011, offset: 61}
48
+ {text: "Nat", tag: "PERSON", score: 0.3112371212688382, offset: 0},
49
+ {text: "GitHub", tag: "ORGANIZATION", score: 0.5660115198329334, offset: 13},
50
+ {text: "San Francisco", tag: "LOCATION", score: 1.3890524313885309, offset: 23}
51
51
  ]
52
52
  ```
53
53
 
@@ -82,13 +82,13 @@ There are 21 detectors for English. You can find them in the `binary_relations`
82
82
  Load a detector
83
83
 
84
84
  ```ruby
85
- detector = Mitie::BinaryRelationDetector.new("rel_classifier_film.film.directed_by.svm")
85
+ detector = Mitie::BinaryRelationDetector.new("rel_classifier_organization.organization.place_founded.svm")
86
86
  ```
87
87
 
88
88
  And create a document
89
89
 
90
90
  ```ruby
91
- doc = model.doc("The Shawshank Redemption was directed by Frank Darabont")
91
+ doc = model.doc("Shopify was founded in Ottawa")
92
92
  ```
93
93
 
94
94
  Get relations
@@ -100,7 +100,7 @@ detector.relations(doc)
100
100
  This returns
101
101
 
102
102
  ```ruby
103
- [{first: "Shawshank Redemption", second: "Frank Darabont", score: 1.124211742912441}]
103
+ [{first: "Shopify", second: "Ottawa", score: 0.17649169745814464}]
104
104
  ```
105
105
 
106
106
  ## History
@@ -4,7 +4,7 @@ module Mitie
4
4
 
5
5
  def initialize(model, text)
6
6
  @model = model
7
- @text = text.to_s
7
+ @text = text
8
8
  end
9
9
 
10
10
  def tokens
@@ -13,16 +13,21 @@ module Mitie
13
13
 
14
14
  def tokens_with_offset
15
15
  @tokens_with_offset ||= begin
16
- i = 0
17
- tokens = []
18
- loop do
19
- token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr
20
- break if token.null?
21
- offset = (offsets_ptr.ptr + i * Fiddle::SIZEOF_LONG).to_s(Fiddle::SIZEOF_LONG).unpack1("L!")
22
- tokens << [token.to_s.force_encoding(text.encoding), offset]
23
- i += 1
16
+ if text.is_a?(Array)
17
+ # offsets are unknown when given tokens
18
+ text.map { |v| [v, nil] }
19
+ else
20
+ i = 0
21
+ tokens = []
22
+ loop do
23
+ token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr
24
+ break if token.null?
25
+ offset = (offsets_ptr.ptr + i * Fiddle::SIZEOF_LONG).to_s(Fiddle::SIZEOF_LONG).unpack1("L!")
26
+ tokens << [token.to_s.force_encoding(text.encoding), offset]
27
+ i += 1
28
+ end
29
+ tokens
24
30
  end
25
- tokens
26
31
  end
27
32
  end
28
33
 
@@ -40,15 +45,20 @@ module Mitie
40
45
  score = FFI.mitie_ner_get_detection_score(detections, i)
41
46
  tok = tokens[pos, len]
42
47
  offset = tok[0][1]
43
- finish = tok[-1][1] + tok[-1][0].size
44
- entities << {
45
- text: text[offset...finish],
46
- tag: tag,
47
- score: score,
48
- offset: offset,
49
- token_index: pos,
50
- token_length: len
51
- }
48
+
49
+ entity = {}
50
+ if offset
51
+ finish = tok[-1][1] + tok[-1][0].size
52
+ entity[:text] = text[offset...finish]
53
+ else
54
+ entity[:text] = tok.map(&:first)
55
+ end
56
+ entity[:tag] = tag
57
+ entity[:score] = score
58
+ entity[:offset] = offset if offset
59
+ entity[:token_index] = pos
60
+ entity[:token_length] = len
61
+ entities << entity
52
62
  end
53
63
  entities
54
64
  ensure
@@ -73,13 +83,22 @@ module Mitie
73
83
 
74
84
  def tokenize
75
85
  @tokenize ||= begin
76
- offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
77
- tokens_ptr = FFI.mitie_tokenize_with_offsets(text, offsets_ptr)
86
+ if text.is_a?(Array)
87
+ # malloc uses memset to set all bytes to 0
88
+ tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1))
89
+ text.size.times do |i|
90
+ tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = Fiddle::Pointer.to_ptr(text[i]).ref
91
+ end
92
+ [tokens_ptr, nil]
93
+ else
94
+ offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP)
95
+ tokens_ptr = FFI.mitie_tokenize_with_offsets(text, offsets_ptr)
78
96
 
79
- ObjectSpace.define_finalizer(tokens_ptr, self.class.finalize(tokens_ptr))
80
- ObjectSpace.define_finalizer(offsets_ptr, self.class.finalize_ptr(offsets_ptr))
97
+ ObjectSpace.define_finalizer(tokens_ptr, self.class.finalize(tokens_ptr))
98
+ ObjectSpace.define_finalizer(offsets_ptr, self.class.finalize_ptr(offsets_ptr))
81
99
 
82
- [tokens_ptr, offsets_ptr]
100
+ [tokens_ptr, offsets_ptr]
101
+ end
83
102
  end
84
103
  end
85
104
 
@@ -1,3 +1,3 @@
1
1
  module Mitie
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mitie
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-15 00:00:00.000000000 Z
11
+ date: 2020-12-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,7 +52,7 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '5'
55
- description:
55
+ description:
56
56
  email: andrew@chartkick.com
57
57
  executables: []
58
58
  extensions: []
@@ -75,7 +75,7 @@ homepage: https://github.com/ankane/mitie
75
75
  licenses:
76
76
  - BSL-1.0
77
77
  metadata: {}
78
- post_install_message:
78
+ post_install_message:
79
79
  rdoc_options: []
80
80
  require_paths:
81
81
  - lib
@@ -90,8 +90,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
90
90
  - !ruby/object:Gem::Version
91
91
  version: '0'
92
92
  requirements: []
93
- rubygems_version: 3.1.2
94
- signing_key:
93
+ rubygems_version: 3.1.4
94
+ signing_key:
95
95
  specification_version: 4
96
96
  summary: Named-entity recognition for Ruby
97
97
  test_files: []