ruby-spacy 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/Gemfile.lock +1 -1
- data/examples/get_started/pos_tags_and_dependencies.rb +1 -1
- data/examples/get_started/similarity.rb +2 -2
- data/examples/japanese/visualizing_dependencies.rb +2 -2
- data/examples/japanese/visualizing_named_entities.rb +1 -1
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +1 -1
- data/examples/linguistic_features/similarity.rb +2 -2
- data/examples/linguistic_features/similarity_between_spans.rb +2 -2
- data/lib/ruby-spacy.rb +331 -325
- data/lib/ruby-spacy/version.rb +1 -1
- metadata +4 -4
- data/examples/linguistic_features/special_case_tokenization_rules.rb +0 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f2571b8bd9a0e170c5462d73b57ab75d535936f591c6b7954d0ca6b2a9d74aeb
+  data.tar.gz: 55bffce60937e8ae1f4135a076185b64f2e989c9431e67914d926ee60b2cc537
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b3de6a6179eef74f224db186075fe084c93fb2fc802c4831b4b23c069594ac8c6aada546336d16065b9eeef116ba3a686afc6c441e68a66ba0fbddd7ffa321a6
+  data.tar.gz: 05b174052487979bf1c81a54469264068dac76d17d17b6f870b7666472625ca596f022abf6f6d734f5b02d4d623b11351c57e1d1d90deb192aaaa36d59e7255c
data/CHANGELOG.md
ADDED
data/Gemfile.lock
CHANGED
data/examples/get_started/pos_tags_and_dependencies.rb
CHANGED
@@ -2,7 +2,7 @@ require "ruby-spacy"
 require "terminal-table"
 
 nlp = Spacy::Language.new("en_core_web_sm")
-doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
+doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion.")
 
 headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
data/examples/get_started/similarity.rb
CHANGED
@@ -4,8 +4,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
 doc1 = nlp.read("I like salty fries and hamburgers.")
 doc2 = nlp.read("Fast food tastes very good.")
 
-puts "Doc 1: " + doc1
-puts "Doc 2: " + doc2
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
 puts "Similarity: #{doc1.similarity(doc2)}"
 
 # Doc 1: I like salty fries and hamburgers.
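The two `+` lines above are a type fix rather than a style change: Ruby's `String#+` requires a `String` argument, so concatenating a `Spacy::Doc` directly raises a `TypeError` even though `Doc#to_s` is defined. A minimal sketch of the two working alternatives, assuming the `en_core_web_lg` model is installed:

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_lg")
doc = nlp.read("Fast food tastes very good.")

puts "Doc: " + doc.text   # String#+ needs an explicit String
puts "Doc: #{doc}"        # interpolation calls Doc#to_s implicitly
```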
data/examples/japanese/visualizing_dependencies.rb
CHANGED
@@ -6,8 +6,8 @@ nlp = Spacy::Language.new("ja_core_news_sm")
 sentence = "自動運転車は保険責任を製造者に転嫁する。"
 doc = nlp.read(sentence)
 
-dep_svg = doc.displacy('dep', false)
+dep_svg = doc.displacy(style: 'dep', compact: false)
 
-File.open(File.join(File.dirname(__FILE__), "
+File.open(File.join(File.dirname(__FILE__), "test_dep.svg"), "w") do |file|
   file.write(dep_svg)
 end
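`Doc#displacy` now takes keyword arguments instead of positional ones, matching the `displacy(style: "dep", compact: false)` signature defined in `lib/ruby-spacy.rb` below. A short sketch of both output styles, assuming the `ja_core_news_sm` model is installed:

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("ja_core_news_sm")
doc = nlp.read("自動運転車は保険責任を製造者に転嫁する。")

dep_svg  = doc.displacy(style: "dep", compact: true)  # SVG markup
ent_html = doc.displacy(style: "ent")                 # HTML markup
File.write("test_dep.svg", dep_svg)
```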
data/examples/linguistic_features/iterating_lefts_and_rights.rb
CHANGED
@@ -5,7 +5,7 @@ nlp = Spacy::Language.new("en_core_web_sm")
 
 doc = nlp.read("bright red apples on the tree")
 
-puts "Text: " + doc
+puts "Text: " + doc.text
 
 puts "Words to the left of 'apple': " + Spacy.generator_to_array(doc[2].lefts).to_s
 puts "Words to the right of 'apple': " + Spacy.generator_to_array(doc[2].rights).to_s
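The context lines above also show `Spacy.generator_to_array` at work: Python's `Token.lefts` and `Token.rights` are generators, and the utility method wraps them in a `PyCall::List` so they can be iterated from Ruby. A minimal sketch, assuming `en_core_web_sm`:

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("bright red apples on the tree")

# doc[2] is "apples"; each yielded item is a PyCall wrapper
# around a Python Token object.
Spacy.generator_to_array(doc[2].lefts).each do |py_token|
  puts py_token.text
end
```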
data/examples/linguistic_features/similarity.rb
CHANGED
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
 doc1 = nlp.read("I like salty fries and hamburgers.")
 doc2 = nlp.read("Fast food tastes very good.")
 
-puts "Doc 1: " + doc1
-puts "Doc 2: " + doc2
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
 puts "Similarity: #{doc1.similarity(doc2)}"
 
 # Doc 1: I like salty fries and hamburgers.
data/examples/linguistic_features/similarity_between_spans.rb
CHANGED
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
 doc1 = nlp.read("I like salty fries and hamburgers.")
 doc2 = nlp.read("Fast food tastes very good.")
 
-puts "Doc 1: " + doc1
-puts "Doc 2: " + doc2
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
 puts "Similarity: #{doc1.similarity(doc2)}"
 
 span1 = doc1.span(2, 2) # salty fries
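This example relies on the two `Doc#span` signatures documented in the library code below: a start position plus a size, or a Ruby range. A sketch of how the span-to-span comparison continues from the lines above:

```ruby
span1 = doc1.span(2, 2)   # "salty fries" (start index 2, two tokens)
span2 = doc2.span(0..1)   # "Fast food" (the same idea, range form)
puts "Similarity: #{span1.similarity(span2)}"
```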
data/lib/ruby-spacy.rb
CHANGED
@@ -3,12 +3,34 @@
 require_relative "ruby-spacy/version"
 require 'enumerator'
 require 'strscan'
-require 'pycall/import'
 require 'numpy'
+require 'pycall/import'
 include PyCall::Import
 
 # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
 module Spacy
+
+  extend PyCall::Import
+  spacy = PyCall.import_module('spacy')
+
+  # Python `Language` class
+  PyLanguage = spacy.language.Language
+
+  # Python `Doc` class object
+  PyDoc = spacy.tokens.Doc
+
+  # Python `Span` class object
+  PySpan = spacy.tokens.Span
+
+  # Python `Token` class object
+  PyToken = spacy.tokens.Token
+
+  # Python `Matcher` class object
+  PyMatcher = spacy.matcher.Matcher
+
+  # Python `displacy` object
+  PyDisplacy = spacy.displacy
+
   # A utility module method to convert Python's generator object to a Ruby array,
   # mainly used on the items inside the array returned from dependency-related methods
   # such as {Span#rights}, {Span#lefts} and {Span#subtree}.
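The constants added above replace the old pattern of importing spaCy by `PyCall.exec`-ing Python source strings: the module now imports `spacy` once via `PyCall.import_module` and keeps direct Ruby references to the Python classes it wraps. A rough illustration of the underlying PyCall pattern:

```ruby
require "pycall/import"

spacy = PyCall.import_module("spacy")
py_doc_class = spacy.tokens.Doc    # the object the PyDoc constant holds
py_displacy  = spacy.displacy      # a Python module object, callable from Ruby
```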
@@ -16,12 +38,303 @@ module Spacy
     PyCall::List.(py_generator)
   end
 
+  # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
+  class Doc
+
+    # @return [Object] a Python `Language` instance accessible via `PyCall`
+    attr_reader :py_nlp
+
+    # @return [Object] a Python `Doc` instance accessible via `PyCall`
+    attr_reader :py_doc
+
+    # @return [String] a text string of the document
+    attr_reader :text
+
+    include Enumerable
+
+    alias_method :length, :count
+    alias_method :len, :count
+    alias_method :size, :count
+
+    # It is recommended to use {Language#read} method to create a doc. If you need to
+    # create one using {Doc#initialize}, there are two method signatures:
+    # `Spacy::Doc.new(nlp_id, py_doc: Object)` and `Spacy::Doc.new(nlp_id, text: String)`.
+    # @param nlp [Language] an instance of {Language} class
+    # @param py_doc [Object] an instance of Python `Doc` class
+    # @param text [String] the text string to be analyzed
+    def initialize(nlp, py_doc: nil, text: nil)
+      @py_nlp = nlp
+      if py_doc
+        @py_doc = py_doc
+      else
+        @py_doc = nlp.(text)
+      end
+      @text = @py_doc.text
+    end
+
+    # Retokenizes the text merging a span into a single token.
+    # @param start_index [Integer] the start position of the span to be retokenized in the document
+    # @param end_index [Integer] the end position of the span to be retokenized in the document
+    # @param attributes [Hash] attributes to set on the merged token
+    def retokenize(start_index, end_index, attributes = {})
+      PyCall.with(@py_doc.retokenize()) do |retokenizer|
+        retokenizer.merge(@py_doc[start_index .. end_index], attrs: attributes)
+      end
+    end
+
+    # Retokenizes the text splitting the specified token.
+    # @param pos_in_doc [Integer] the position of the span to be retokenized in the document
+    # @param split_array [Array<String>] text strings of the split results
+    # @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
+    # @param attributes [Hash] the attributes of the split elements
+    def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
+      PyCall.with(@py_doc.retokenize()) do |retokenizer|
+        heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
+        retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
+      end
+    end
+
+    # String representation of the document.
+    # @return [String]
+    def to_s
+      @text
+    end
+
+    # Returns an array of tokens contained in the doc.
+    # @return [Array<Token>]
+    def tokens
+      results = []
+      PyCall::List.(@py_doc).each do |py_token|
+        results << Token.new(py_token)
+      end
+      results
+    end
+
+    # Iterates over the elements in the doc yielding a token instance each time.
+    def each
+      PyCall::List.(@py_doc).each do |py_token|
+        yield Token.new(py_token)
+      end
+    end
+
+    # Returns a span of the specified range within the doc.
+    # The method should be used either of the two ways: `Doc#span(range)` or `Doc#span{start_pos, size_of_span}`.
+    # @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
+    # @param optional_size [Integer] an integer representing the size of the span
+    # @return [Span]
+    def span(range_or_start, optional_size = nil)
+      if optional_size
+        start_index = range_or_start
+        temp = tokens[start_index ... start_index + optional_size]
+      else
+        start_index = range_or_start.first
+        range = range_or_start
+        temp = tokens[range]
+      end
+
+      end_index = start_index + temp.size - 1
+
+      Span.new(self, start_index: start_index, end_index: end_index)
+    end
+
+    # Returns an array of spans representing noun chunks.
+    # @return [Array<Span>]
+    def noun_chunks
+      chunk_array = []
+      py_chunks = PyCall::List.(@py_doc.noun_chunks)
+      py_chunks.each do |py_chunk|
+        chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
+      end
+      chunk_array
+    end
+
+    # Returns an array of spans each representing a sentence.
+    # @return [Array<Span>]
+    def sents
+      sentence_array = []
+      py_sentences = PyCall::List.(@py_doc.sents)
+      py_sentences.each do |py_sent|
+        sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
+      end
+      sentence_array
+    end
+
+    # Returns an array of spans each representing a named entity.
+    # @return [Array<Span>]
+    def ents
+      # so that ents canbe "each"-ed in Ruby
+      ent_array = []
+      PyCall::List.(@py_doc.ents).each do |ent|
+        ent_array << ent
+      end
+      ent_array
+    end
+
+    # Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
+    # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
+    def [](range)
+      if range.is_a?(Range)
+        py_span = @py_doc[range]
+        return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
+      else
+        return Token.new(@py_doc[range])
+      end
+    end
+
+    # Returns a semantic similarity estimate.
+    # @param other [Doc] the other doc to which a similarity estimation is made
+    # @return [Float]
+    def similarity(other)
+      py_doc.similarity(other.py_doc)
+    end
+
+    # Visualize the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
+    # @param style [String] either `dep` or `ent`
+    # @param compact [Boolean] only relevant to the `dep' style
+    # @return [String] in the case of `dep`, the output text will be an SVG, whereas in the `ent` style, the output text will be an HTML.
+    def displacy(style: "dep", compact: false)
+      PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
+    end
+
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_doc.send(name, *args)
+    end
+  end
+
+  # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
+  class Language
+
+    # @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
+    attr_reader :spacy_nlp_id
+
+    # @return [Object] a Python `Language` instance accessible via `PyCall`
+    attr_reader :py_nlp
+
+    # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
+    # @param model [String] A language model installed in the system
+    def initialize(model = "en_core_web_sm")
+      @spacy_nlp_id = "nlp_#{model.object_id}"
+      PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
+      @py_nlp = PyCall.eval(@spacy_nlp_id)
+    end
+
+    # Reads and analyze the given text.
+    # @param text [String] a text to be read and analyzed
+    def read(text)
+      Doc.new(py_nlp, text: text)
+    end
+
+    # Generates a matcher for the current language model.
+    # @return [Matcher]
+    def matcher
+      Matcher.new(@py_nlp)
+    end
+
+    # A utility method to lookup a vocabulary item of the given id.
+    # @param id [Integer] a vocabulary id
+    # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
+    def vocab_string_lookup(id)
+      PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+    end
+
+    # A utility method to list pipeline components.
+    # @return [Array<String>] An array of text strings representing pipeline components
+    def pipe_names
+      pipe_array = []
+      PyCall::List.(@py_nlp.pipe_names).each do |pipe|
+        pipe_array << pipe
+      end
+      pipe_array
+    end
+
+    # A utility method to get a Python `Lexeme` object.
+    # @param text [String] A text string representing a lexeme
+    # @return [Object] Python `Lexeme` object (https://spacy.io/api/lexeme)
+    def get_lexeme(text)
+      text = text.gsub("'", "\'")
+      @py_nlp.vocab[text]
+    end
+
+    # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
+    # @param vector [Object] A vector representation of a word (whether existing or non-existing)
+    # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
+    def most_similar(vector, n)
+      vec_array = Numpy.asarray([vector])
+      py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
+      key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist}]")
+      keys = key_texts.map{|kt| kt[0]}
+      texts = key_texts.map{|kt| kt[1]}
+      best_rows = PyCall::List.(py_result[1])[0]
+      scores = PyCall::List.(py_result[2])[0]
+
+      results = []
+      n.times do |i|
+        results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
+      end
+      results
+    end
+
+    # Utility function to batch process many texts
+    # @param texts [String]
+    # @param disable [Array<String>]
+    # @param batch_size [Integer]
+    # @return [Array<Doc>]
+    def pipe(texts, disable: [], batch_size: 50)
+      docs = []
+      PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
+        docs << Doc.new(@py_nlp, py_doc: py_doc)
+      end
+      docs
+    end
+
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism....
+    def method_missing(name, *args)
+      @py_nlp.send(name, *args)
+    end
+  end
+
+  # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
+  class Matcher
+
+    # @return [Object] a Python `Matcher` instance accessible via `PyCall`
+    attr_reader :py_matcher
+
+    # Creates a {Matcher} instance
+    # @param nlp [Language] an instance of {Language} class
+    def initialize(nlp)
+      @py_matcher = PyMatcher.(nlp.vocab)
+    end
+
+    # Adds a label string and a text pattern.
+    # @param text [String] a label string given to the pattern
+    # @param pattern [Array<Array<Hash>>] sequences of text patterns that are alternative to each other
+    def add(text, pattern)
+      @py_matcher.add(text, pattern)
+    end
+
+    # Execute the match.
+    # @param doc [Doc] an {Doc} instance
+    # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
+    def match(doc)
+      str_results = @py_matcher.(doc.py_doc).to_s
+      s = StringScanner.new(str_results[1..-2])
+      results = []
+      while s.scan_until(/(\d+), (\d+), (\d+)/)
+        next unless s.matched
+        triple = s.matched.split(", ")
+        match_id = triple[0].to_i
+        start_index = triple[1].to_i
+        end_index = triple[2].to_i - 1
+        results << {match_id: match_id, start_index: start_index, end_index: end_index}
+      end
+      results
+    end
+  end
+
   # See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
   class Span
 
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_span_id
-
     # @return [Object] a Python `Span` instance accessible via `PyCall`
     attr_reader :py_span
 
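A short usage sketch for the rewritten classes above, assuming `en_core_web_sm` is installed. `Language#matcher` wraps the model's vocab in a `Matcher`, and `Matcher#match` returns hashes whose `:end_index` is inclusive (hence the `- 1` in the parsing code), so a match can feed `Doc#span` directly:

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
matcher = nlp.matcher
# One alternative token sequence: "hello" + punctuation + "world"
matcher.add("HELLO_WORLD", [[{LOWER: "hello"}, {IS_PUNCT: true}, {LOWER: "world"}]])

doc = nlp.read("Hello, world! Hello world!")
matcher.match(doc).each do |m|
  span = doc.span(m[:start_index]..m[:end_index])
  puts "#{nlp.vocab_string_lookup(m[:match_id])}: #{span.text}"
end
```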
@@ -35,21 +348,18 @@ module Spacy
     alias_method :size, :count
 
     # It is recommended to use {Doc#span} method to create a span. If you need to
-    # create one using {Span#initialize},
+    # create one using {Span#initialize}, there are two method signatures:
+    # `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
     # @param doc [Doc] the document to which this span belongs to
     # @param start_index [Integer] the index of the item starting the span inside a doc
     # @param end_index [Integer] the index of the item ending the span inside a doc
     # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
     def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
       @doc = doc
-      @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
       if py_span
         @py_span = py_span
       else
-
-        PyCall.exec("#{@spacy_span_id}_opts = #{options}")
-        PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
-        @py_span = PyCall.eval(@spacy_span_id)
+        @py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
       end
     end
 
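With this change a span is built by calling the Python `Span` class object directly (`PySpan.(py_doc, start, end + 1, options)`) instead of assembling `PyCall.exec` strings. `Doc#span` remains the recommended entry point, but a direct construction with a label option would look roughly like this (a sketch, not taken from the gem's examples; it assumes PyCall's usual trailing-hash-to-keyword conversion):

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("San Francisco considers banning sidewalk delivery robots")

# end_index is inclusive on the Ruby side; the wrapper passes end + 1 to Python.
span = Spacy::Span.new(doc, start_index: 0, end_index: 1, options: {label: "GPE"})
puts span.text   # => "San Francisco"
```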
@@ -63,7 +373,7 @@ module Spacy
       results
     end
 
-    # Iterates over the elements in the span yielding a token instance.
+    # Iterates over the elements in the span yielding a token instance each time.
     def each
       PyCall::List.(@py_span).each do |py_token|
         yield Token.new(py_token)
@@ -97,7 +407,6 @@ module Spacy
     def ents
       ent_array = []
       PyCall::List.(@py_span.ents).each do |py_span|
-        # ent_array << ent
         ent_array << Spacy::Span.new(@doc, py_span: py_span)
       end
       ent_array
@@ -106,11 +415,11 @@ module Spacy
     # Returns a span that represents the sentence that the given span is part of.
     # @return [Span]
     def sent
-      py_span
+      py_span = @py_span.sent
       return Spacy::Span.new(@doc, py_span: py_span)
     end
 
-    # Returns a span if a range object is given
+    # Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
     # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
     def [](range)
       if range.is_a?(Range)
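`Span#sent` now reaches through to Python's `Span.sent` attribute to find the enclosing sentence. A brief sketch of the behaviour, assuming `en_core_web_sm`:

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Give it back! He pleaded.")

span = doc.span(4..5)   # "He pleaded"
puts span.sent.text     # => "He pleaded." (the whole enclosing sentence)
```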
@@ -125,16 +434,16 @@ module Spacy
     # @param other [Span] the other span to which a similarity estimation is conducted
     # @return [Float]
     def similarity(other)
-
+      py_span.similarity(other.py_span)
     end
 
-    # Creates a document instance
+    # Creates a document instance from the span
     # @return [Doc]
     def as_doc
-      Spacy::Doc.new(@doc.
+      Spacy::Doc.new(@doc.py_nlp, text: self.text)
     end
 
-    # Returns
+    # Returns tokens conjugated to the root of the span.
     # @return [Array<Token>] an array of tokens
     def conjuncts
       conjunct_array = []
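`Span#as_doc` now re-parses the span's text through the parent doc's `py_nlp`, yielding a fresh `Spacy::Doc` whose token indices start at zero. A brief sketch:

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("New York is a big city in the United States.")

sub_doc = doc.span(0, 2).as_doc            # a new doc holding just "New York"
puts sub_doc.tokens.map(&:text).join("/")  # => "New/York"
```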
@@ -144,7 +453,7 @@ module Spacy
       conjunct_array
     end
 
-    # Returns
+    # Returns tokens that are to the left of the span, whose heads are within the span.
     # @return [Array<Token>] an array of tokens
     def lefts
       left_array = []
@@ -189,7 +498,8 @@ module Spacy
     # @return [String] a string representing the token
     attr_reader :text
 
-    # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
+    # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
+    # There is no way to generate a token from scratch but relying on a pre-exising Python {Token} object.
     # @param py_token [Object] Python `Token` object
     def initialize(py_token)
       @py_token = py_token
@@ -253,7 +563,7 @@ module Spacy
     end
 
     # Returns a hash or string of morphological information
-    # @param
+    # @param hash [Boolean] if true, a hash will be returned instead of a string
     # @return [Hash, String]
     def morphology(hash = true)
       if @py_token.has_morph
@@ -278,310 +588,6 @@ module Spacy
     end
   end
 
-  # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
-  class Doc
-
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_nlp_id
-
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_doc_id
-
-    # @return [Object] a Python `Doc` instance accessible via `PyCall`
-    attr_reader :py_doc
-
-    # @return [String] a text string of the document
-    attr_reader :text
-
-    include Enumerable
-
-    alias_method :length, :count
-    alias_method :len, :count
-    alias_method :size, :count
-
-    # Creates a new instance of {Doc}.
-    # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
-    # @param text [String] The text string to be analyzed
-    def initialize(nlp_id, text)
-      @text = text
-      @spacy_nlp_id = nlp_id
-      @spacy_doc_id = "doc_#{text.object_id}"
-      quoted = text.gsub('"', '\"')
-      PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
-      PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
-      @py_doc = PyCall.eval(@spacy_doc_id)
-    end
-
-
-    # Retokenizes the text merging a span into a single token.
-    # @param start_index [Integer] The start position of the span to be retokenized in the document
-    # @param end_index [Integer] The end position of the span to be retokenized in the document
-    # @param attributes [Hash] Attributes to set on the merged token
-    def retokenize(start_index, end_index, attributes = {})
-      py_attrs = PyCall::Dict.(attributes)
-      PyCall.exec(<<PY)
-with #{@spacy_doc_id}.retokenize() as retokenizer:
-    retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
-PY
-      @py_doc = PyCall.eval(@spacy_doc_id)
-    end
-
-    # Retokenizes the text splitting the specified token.
-    # @param pos_in_doc [Integer] The position of the span to be retokenized in the document
-    # @param split_array [Array<String>] text strings of the split results
-    # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
-    # @param attributes [Hash] The attributes of the split elements
-    def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
-      py_attrs = PyCall::Dict.(attributes)
-      py_split_array = PyCall::List.(split_array)
-      PyCall.exec(<<PY)
-with #{@spacy_doc_id}.retokenize() as retokenizer:
-    heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
-    attrs = #{py_attrs}
-    split_array = #{py_split_array}
-    retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
-PY
-      @py_doc = PyCall.eval(@spacy_doc_id)
-    end
-
-    # String representation of the token.
-    # @return [String]
-    def to_s
-      @text
-    end
-
-    # Returns an array of tokens contained in the doc.
-    # @return [Array<Token>]
-    def tokens
-      results = []
-      PyCall::List.(@py_doc).each do |py_token|
-        results << Token.new(py_token)
-      end
-      results
-    end
-
-    # Iterates over the elements in the doc yielding a token instance.
-    def each
-      PyCall::List.(@py_doc).each do |py_token|
-        yield Token.new(py_token)
-      end
-    end
-
-    # Returns a span of the specified range within the doc.
-    # The method should be used either of the two ways: `Doc#span(range)` or `Doc#span{start_pos, size_of_span}`.
-    # @param range_or_start [Range, Integer] A range object, or, alternatively, an integer that represents the start position of the span
-    # @param optional_size [Integer] An integer representing the size of the span
-    # @return [Span]
-    def span(range_or_start, optional_size = nil)
-      if optional_size
-        start_index = range_or_start
-        temp = tokens[start_index ... start_index + optional_size]
-      else
-        start_index = range_or_start.first
-        range = range_or_start
-        temp = tokens[range]
-      end
-
-      end_index = start_index + temp.size - 1
-
-      Span.new(self, start_index: start_index, end_index: end_index)
-    end
-
-    # Returns an array of spans representing noun chunks.
-    # @return [Array<Span>]
-    def noun_chunks
-      chunk_array = []
-      py_chunks = PyCall::List.(@py_doc.noun_chunks)
-      py_chunks.each do |py_chunk|
-        chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
-      end
-      chunk_array
-    end
-
-    # Returns an array of spans representing sentences.
-    # @return [Array<Span>]
-    def sents
-      sentence_array = []
-      py_sentences = PyCall::List.(@py_doc.sents)
-      py_sentences.each do |py_sent|
-        sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
-      end
-      sentence_array
-    end
-
-    # Returns an array of spans representing named entities.
-    # @return [Array<Span>]
-    def ents
-      # so that ents canbe "each"-ed in Ruby
-      ent_array = []
-      PyCall::List.(@py_doc.ents).each do |ent|
-        ent_array << ent
-      end
-      ent_array
-    end
-
-    # Returns a span if given a range object; returns a token if given an integer representing a position in the doc.
-    # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
-    def [](range)
-      if range.is_a?(Range)
-        py_span = @py_doc[range]
-        return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
-      else
-        return Token.new(@py_doc[range])
-      end
-    end
-
-    # Returns a semantic similarity estimate.
-    # @param other [Doc] the other doc to which a similarity estimation is made
-    # @return [Float]
-    def similarity(other)
-      PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
-    end
-
-    # Visualize the document in one of two styles: dep (dependencies) or ent (named entities).
-    # @param style [String] Either `dep` or `ent`
-    # @param compact [Boolean] Only relevant to the `dep' style
-    # @return [String] in the case of `dep`, the output text is an SVG while in the `ent` style, the output text is an HTML.
-    def displacy(style: "dep", compact: false)
-      PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
-    end
-
-    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
-    def method_missing(name, *args)
-      @py_doc.send(name, *args)
-    end
-  end
-
-  # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
-  class Matcher
-
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_matcher_id
-
-    # @return [Object] a Python `Matcher` instance accessible via `PyCall`
-    attr_reader :py_matcher
-
-    # Creates a {Matcher} instance
-    # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
-    def initialize(nlp_id)
-      @spacy_matcher_id = "doc_#{nlp_id}_matcher"
-      PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
-      @py_matcher = PyCall.eval(@spacy_matcher_id)
-    end
-
-    # Adds a label string and a text pattern.
-    # @param text [String] a label string given to the pattern
-    # @param pattern [Array<Array<Hash>>] alternative sequences of text patterns
-    def add(text, pattern)
-      @py_matcher.add(text, pattern)
-    end
-
-    # Execute the match.
-    # @param doc [Doc] An {Doc} instance
-    # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] The id of the matched pattern, the starting position, and the end position
-    def match(doc)
-      str_results = PyCall.eval("#{@spacy_matcher_id}(#{doc.spacy_doc_id})").to_s
-      s = StringScanner.new(str_results[1..-2])
-      results = []
-      while s.scan_until(/(\d+), (\d+), (\d+)/)
-        next unless s.matched
-        triple = s.matched.split(", ")
-        match_id = triple[0].to_i
-        start_index = triple[1].to_i
-        end_index = triple[2].to_i - 1
-        results << {match_id: match_id, start_index: start_index, end_index: end_index}
-      end
-      results
-    end
-  end
-
-  # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
-  class Language
-
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_nlp_id
-
-    # @return [Object] a Python `Language` instance accessible via `PyCall`
-    attr_reader :py_nlp
-
-    # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
-    # @param model [String] A language model installed in the system
-    def initialize(model = "en_core_web_sm")
-      @spacy_nlp_id = "nlp_#{model.object_id}"
-      PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
-      PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
-      @py_nlp = PyCall.eval(@spacy_nlp_id)
-    end
-
-    # Reads and analyze the given text.
-    # @param text [String] A text to be read and analyzed
-    def read(text)
-      Doc.new(@spacy_nlp_id, text)
-    end
-
-    # Generates a matcher for the current language model.
-    # @return [Matcher]
-    def matcher
-      Matcher.new(@spacy_nlp_id)
-    end
-
-    # A utility method to lookup a vocabulary item of the given id.
-    # @param id [Integer] A vocabulary id
-    # @return [Object] A Python `Lexeme` object
-    def vocab_string_lookup(id)
-      PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
-    end
-
-    # A utility method to list pipeline components.
-    # @return [Array<String>] An array of text strings representing pipeline components
-    def pipe_names
-      pipe_array = []
-      PyCall::List.(@py_nlp.pipe_names).each do |pipe|
-        pipe_array << pipe
-      end
-      pipe_array
-    end
-
-    # A utility method to get the tokenizer Python object.
-    # @return [Object] Python `Tokenizer` object
-    def tokenizer
-      return PyCall.eval("#{@spacy_nlp_id}.tokenizer")
-    end
-
-    # A utility method to get a Python `Lexeme` object.
-    # @param text [String] A text string representing a lexeme
-    # @return [Object] Python `Tokenizer` object
-    def get_lexeme(text)
-      text = text.gsub("'", "\'")
-      py_lexeme = PyCall.eval("#{@spacy_nlp_id}.vocab['#{text}']")
-      return py_lexeme
-    end
-
-    # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
-    # @param vector [Object] A vector representation of a word (whether existing or non-existing)
-    # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
-    def most_similar(vector, n)
-      vec_array = Numpy.asarray([vector])
-      py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
-      key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
-      keys = key_texts.map{|kt| kt[0]}
-      texts = key_texts.map{|kt| kt[1]}
-      best_rows = PyCall::List.(py_result[1])[0]
-      scores = PyCall::List.(py_result[2])[0]
-
-      results = []
-      n.times do |i|
-        results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
-      end
-
-      results
-    end
-
-    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism....
-    def method_missing(name, *args)
-      @py_nlp.send(name, *args)
-    end
-  end
 
 end
 
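The 300-odd removed lines above are the old string-eval versions of `Doc`, `Matcher`, and `Language`, which routed every operation through `PyCall.exec`/`PyCall.eval` on generated Python source; they are superseded by the object-based classes added earlier in this diff. One capability that is new rather than ported is `Language#pipe`; a sketch of batch processing with it, assuming `en_core_web_sm`:

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
texts = ["This is the first text.", "And here is another one."]

# Streams the texts through the pipeline in batches; components that
# are not needed can be disabled for speed.
docs = nlp.pipe(texts, disable: ["ner"], batch_size: 50)
docs.each { |doc| puts doc.text }
```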
data/lib/ruby-spacy/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ruby-spacy
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-06-
+date: 2021-06-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pycall
@@ -66,6 +66,7 @@ extra_rdoc_files: []
 files:
 - ".gitignore"
 - ".yardopts"
+- CHANGELOG.md
 - Gemfile
 - Gemfile.lock
 - LICENSE.txt
@@ -123,7 +124,6 @@ files:
 - examples/linguistic_features/sentence_segmentation.rb
 - examples/linguistic_features/similarity.rb
 - examples/linguistic_features/similarity_between_spans.rb
-- examples/linguistic_features/special_case_tokenization_rules.rb
 - examples/linguistic_features/tokenization.rb
 - examples/rule_based_matching/creating_spans_from_matches.rb
 - examples/rule_based_matching/matcher.rb
@@ -149,7 +149,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.2.
+rubygems_version: 3.2.3
 signing_key:
 specification_version: 4
 summary: A wrapper module for using spaCy natural language processing library from
data/examples/linguistic_features/special_case_tokenization_rules.rb
DELETED
@@ -1,19 +0,0 @@
-require "ruby-spacy"
-require "terminal-table"
-
-nlp = Spacy::Language.new("en_core_web_sm")
-
-doc = nlp.read("gimme that")
-
-puts doc.tokens.join(" ")
-
-# Add special case rule
-special_case = [{ORTH: "gim"}, {ORTH: "me"}]
-tokenizer = nlp.tokenizer
-tokenizer.add_special_case("gimme", special_case)
-
-# Check new tokenization
-puts nlp.read("gimme that").tokens.join(" ")
-
-# gimme that
-# gim me that