ruby-spacy 0.1.2 → 0.1.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 9add9d3b065bbf5064652cb115f824221d929a20478d182782df5db564cc8f45
- data.tar.gz: f07d502f79883a452e7f250f0fe784425511a0de4f8a43db0b29ca03801bd755
+ metadata.gz: f2571b8bd9a0e170c5462d73b57ab75d535936f591c6b7954d0ca6b2a9d74aeb
+ data.tar.gz: 55bffce60937e8ae1f4135a076185b64f2e989c9431e67914d926ee60b2cc537
  SHA512:
- metadata.gz: 373c795a148034f4191cfaf130a23f464dc2b43927bf6aa3165999c78797365ce2f976021ea8b9ab1dd083736e5f9a1da51a5ccf0156d00ec39dac9fd19bde7c
- data.tar.gz: e370e503c23d15a0a44be84bf578775b0a4acc5557468c7fc9468cde44e0e084018be8dc17c3e7c21d9efdaf229611ca234614fcd2e811272051c7c2922b408d
+ metadata.gz: b3de6a6179eef74f224db186075fe084c93fb2fc802c4831b4b23c069594ac8c6aada546336d16065b9eeef116ba3a686afc6c441e68a66ba0fbddd7ffa321a6
+ data.tar.gz: 05b174052487979bf1c81a54469264068dac76d17d17b6f870b7666472625ca596f022abf6f6d734f5b02d4d623b11351c57e1d1d90deb192aaaa36d59e7255c
data/CHANGELOG.md ADDED
@@ -0,0 +1,14 @@
+ # Change Log
+
+ ## 0.1.3 - 2021-06-26
+ - Code cleanup
+
+ ## 0.1.2 - 2021-06-26
+ ### Added
+ - `Spacy::Token#morpheme` method
+
+ ## 0.1.1 - 2021-06-26
+ - Project description fixed
+
+ ## 0.1.0 - 2021-06-26
+ - Initial release
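
The 0.1.2 entry above adds morphological analysis to tokens; in the code shipped with this release the feature surfaces as `Token#morphology` (see data/lib/ruby-spacy.rb below). A minimal usage sketch, assuming the `en_core_web_sm` model is installed:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_sm")
    doc = nlp.read("She was reading the paper.")

    doc.each do |token|
      # morphology(false) returns a string such as "Tense=Past|VerbForm=Part";
      # morphology(true), the default, returns a Hash of the same features
      puts "#{token.text}: #{token.morphology(false)}"
    end
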
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
    remote: .
    specs:
-     ruby-spacy (0.1.2)
+     ruby-spacy (0.1.3)
        numpy (~> 0.4.0)
        pycall (~> 1.4.0)
        terminal-table (~> 3.0.1)
@@ -2,7 +2,7 @@ require "ruby-spacy"
  require "terminal-table"

  nlp = Spacy::Language.new("en_core_web_sm")
- doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
+ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion.")

  headings = ["text", "lemma", "pos", "tag", "dep"]
  rows = []
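
The hunk above is from a part-of-speech tagging example that prints a table with terminal-table. A sketch of how the `headings` and `rows` above are presumably filled and rendered (`lemma_`, `pos_`, `tag_`, and `dep_` reach the Python token through `Token#method_missing`):

    require "ruby-spacy"
    require "terminal-table"

    nlp = Spacy::Language.new("en_core_web_sm")
    doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion.")

    headings = ["text", "lemma", "pos", "tag", "dep"]
    rows = doc.map { |t| [t.text, t.lemma_, t.pos_, t.tag_, t.dep_] }

    puts Terminal::Table.new rows: rows, headings: headings
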
@@ -4,8 +4,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
  doc1 = nlp.read("I like salty fries and hamburgers.")
  doc2 = nlp.read("Fast food tastes very good.")

- puts "Doc 1: " + doc1
- puts "Doc 2: " + doc2
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
  puts "Similarity: #{doc1.similarity(doc2)}"

  # Doc 1: I like salty fries and hamburgers.
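
The `.text` fix above is needed because `Spacy::Doc` is not a String: `String#+` performs implicit conversion via `to_str`, which `Doc` does not define, so concatenating a doc raises a `TypeError`. Interpolation, which calls `Doc#to_s`, works either way. A small sketch of the distinction:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_lg")
    doc = nlp.read("Fast food tastes very good.")

    puts "Doc: #{doc}"       # fine: interpolation calls Doc#to_s
    puts "Doc: " + doc.text  # fine: explicit String
    # puts "Doc: " + doc     # TypeError: no implicit conversion into String
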
@@ -6,8 +6,8 @@ nlp = Spacy::Language.new("ja_core_news_sm")
  sentence = "自動運転車は保険責任を製造者に転嫁する。"
  doc = nlp.read(sentence)

- dep_svg = doc.displacy('dep', false)
+ dep_svg = doc.displacy(style: 'dep', compact: false)

- File.open(File.join(File.dirname(__FILE__), "outputs/test_dep.svg"), "w") do |file|
+ File.open(File.join(File.dirname(__FILE__), "test_dep.svg"), "w") do |file|
    file.write(dep_svg)
  end
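
The `displacy` call sites change because the method now takes keyword arguments: the new signature is `Doc#displacy(style:, compact:)` (see data/lib/ruby-spacy.rb below). A sketch of both output styles, assuming the model is installed:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_sm")
    doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion.")

    svg  = doc.displacy(style: "dep", compact: true)  # SVG dependency tree
    html = doc.displacy(style: "ent")                 # HTML with highlighted entities
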
@@ -7,7 +7,7 @@ sentence ="セバスチアン・スランが2007年にグーグルで自動運

  doc = nlp.read(sentence)

- ent_html = doc.displacy('ent')
+ ent_html = doc.displacy(style: 'ent')

  File.open(File.join(File.dirname(__FILE__), "outputs/test_ent.html"), "w") do |file|
    file.write(ent_html)
@@ -5,7 +5,7 @@ nlp = Spacy::Language.new("en_core_web_sm")

  doc = nlp.read("bright red apples on the tree")

- puts "Text: " + doc
+ puts "Text: " + doc.text

  puts "Words to the left of 'apple': " + Spacy.generator_to_array(doc[2].lefts).to_s
  puts "Words to the right of 'apple': " + Spacy.generator_to_array(doc[2].rights).to_s
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
  doc1 = nlp.read("I like salty fries and hamburgers.")
  doc2 = nlp.read("Fast food tastes very good.")

- puts "Doc 1: " + doc1
- puts "Doc 2: " + doc2
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
  puts "Similarity: #{doc1.similarity(doc2)}"

  # Doc 1: I like salty fries and hamburgers.
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
  doc1 = nlp.read("I like salty fries and hamburgers.")
  doc2 = nlp.read("Fast food tastes very good.")

- puts "Doc 1: " + doc1
- puts "Doc 2: " + doc2
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
  puts "Similarity: #{doc1.similarity(doc2)}"

  span1 = doc1.span(2, 2) # salty fries
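
`Doc#span` used above accepts either a start position plus a size, or a Range (see `Doc#span` in the library diff below). Both forms pick out the same tokens:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_lg")
    doc = nlp.read("I like salty fries and hamburgers.")

    span_a = doc.span(2, 2)   # start position 2, two tokens: "salty fries"
    span_b = doc.span(2..3)   # the equivalent Range form
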
data/lib/ruby-spacy.rb CHANGED
@@ -3,12 +3,34 @@
  require_relative "ruby-spacy/version"
  require 'enumerator'
  require 'strscan'
- require 'pycall/import'
  require 'numpy'
+ require 'pycall/import'
  include PyCall::Import

  # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
  module Spacy
+
+   extend PyCall::Import
+   spacy = PyCall.import_module('spacy')
+
+   # Python `Language` class
+   PyLanguage = spacy.language.Language
+
+   # Python `Doc` class object
+   PyDoc = spacy.tokens.Doc
+
+   # Python `Span` class object
+   PySpan = spacy.tokens.Span
+
+   # Python `Token` class object
+   PyToken = spacy.tokens.Token
+
+   # Python `Matcher` class object
+   PyMatcher = spacy.matcher.Matcher
+
+   # Python `displacy` object
+   PyDisplacy = spacy.displacy
+
    # A utility module method to convert Python's generator object to a Ruby array,
    # mainly used on the items inside the array returned from dependency-related methods
    # such as {Span#rights}, {Span#lefts} and {Span#subtree}.
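
The 0.1.3 rewrite replaces string-based `PyCall.exec`/`PyCall.eval` plumbing with direct object references obtained once from `PyCall.import_module` and bound as Ruby constants, as the hunk above shows. A minimal sketch of the pattern, assuming PyCall and spaCy are installed:

    require "pycall/import"

    spacy = PyCall.import_module("spacy")
    # Python attributes are reachable as ordinary method calls on the module object
    puts spacy.tokens.Doc  # presumably prints <class 'spacy.tokens.doc.Doc'>
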
@@ -16,12 +38,303 @@ module Spacy
      PyCall::List.(py_generator)
    end

+   # See also the spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
+   class Doc
+
+     # @return [Object] a Python `Language` instance accessible via `PyCall`
+     attr_reader :py_nlp
+
+     # @return [Object] a Python `Doc` instance accessible via `PyCall`
+     attr_reader :py_doc
+
+     # @return [String] a text string of the document
+     attr_reader :text
+
+     include Enumerable
+
+     alias_method :length, :count
+     alias_method :len, :count
+     alias_method :size, :count
+
+     # It is recommended to use the {Language#read} method to create a doc. If you need to
+     # create one using {Doc#initialize}, there are two method signatures:
+     # `Spacy::Doc.new(nlp, py_doc: Object)` and `Spacy::Doc.new(nlp, text: String)`.
+     # @param nlp [Language] an instance of the {Language} class
+     # @param py_doc [Object] an instance of the Python `Doc` class
+     # @param text [String] the text string to be analyzed
+     def initialize(nlp, py_doc: nil, text: nil)
+       @py_nlp = nlp
+       if py_doc
+         @py_doc = py_doc
+       else
+         @py_doc = nlp.(text)
+       end
+       @text = @py_doc.text
+     end
+
+     # Retokenizes the text, merging a span into a single token.
+     # @param start_index [Integer] the start position of the span to be retokenized in the document
+     # @param end_index [Integer] the end position of the span to be retokenized in the document
+     # @param attributes [Hash] attributes to set on the merged token
+     def retokenize(start_index, end_index, attributes = {})
+       PyCall.with(@py_doc.retokenize()) do |retokenizer|
+         retokenizer.merge(@py_doc[start_index .. end_index], attrs: attributes)
+       end
+     end
+
+     # Retokenizes the text, splitting the specified token.
+     # @param pos_in_doc [Integer] the position of the span to be retokenized in the document
+     # @param split_array [Array<String>] text strings of the split results
+     # @param head_pos_in_split [Integer] the position of the head element among the split elements
+     # @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
+     # @param attributes [Hash] the attributes of the split elements
+     def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
+       PyCall.with(@py_doc.retokenize()) do |retokenizer|
+         heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
+         retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
+       end
+     end
+
+     # String representation of the document.
+     # @return [String]
+     def to_s
+       @text
+     end
+
+     # Returns an array of tokens contained in the doc.
+     # @return [Array<Token>]
+     def tokens
+       results = []
+       PyCall::List.(@py_doc).each do |py_token|
+         results << Token.new(py_token)
+       end
+       results
+     end
+
+     # Iterates over the elements in the doc, yielding a token instance each time.
+     def each
+       PyCall::List.(@py_doc).each do |py_token|
+         yield Token.new(py_token)
+       end
+     end
+
+     # Returns a span covering the specified range within the doc.
+     # The method can be called in either of two ways: `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
+     # @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
+     # @param optional_size [Integer] an integer representing the size of the span
+     # @return [Span]
+     def span(range_or_start, optional_size = nil)
+       if optional_size
+         start_index = range_or_start
+         temp = tokens[start_index ... start_index + optional_size]
+       else
+         start_index = range_or_start.first
+         range = range_or_start
+         temp = tokens[range]
+       end
+
+       end_index = start_index + temp.size - 1
+
+       Span.new(self, start_index: start_index, end_index: end_index)
+     end
+
+     # Returns an array of spans representing noun chunks.
+     # @return [Array<Span>]
+     def noun_chunks
+       chunk_array = []
+       py_chunks = PyCall::List.(@py_doc.noun_chunks)
+       py_chunks.each do |py_chunk|
+         chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
+       end
+       chunk_array
+     end
+
+     # Returns an array of spans, each representing a sentence.
+     # @return [Array<Span>]
+     def sents
+       sentence_array = []
+       py_sentences = PyCall::List.(@py_doc.sents)
+       py_sentences.each do |py_sent|
+         sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
+       end
+       sentence_array
+     end
+
+     # Returns an array of spans, each representing a named entity.
+     # @return [Array<Span>]
+     def ents
+       # so that ents can be "each"-ed in Ruby
+       ent_array = []
+       PyCall::List.(@py_doc.ents).each do |ent|
+         ent_array << ent
+       end
+       ent_array
+     end
+
+     # Returns a span if given a range object, or a token if given an integer representing a position in the doc.
+     # @param range [Range] an ordinary Ruby Range object such as `0..3`, `1...4`, or `3 .. -1`
+     def [](range)
+       if range.is_a?(Range)
+         py_span = @py_doc[range]
+         return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
+       else
+         return Token.new(@py_doc[range])
+       end
+     end
+
+     # Returns a semantic similarity estimate.
+     # @param other [Doc] the other doc against which the similarity estimation is made
+     # @return [Float]
+     def similarity(other)
+       py_doc.similarity(other.py_doc)
+     end
+
+     # Visualizes the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
+     # @param style [String] either `dep` or `ent`
+     # @param compact [Boolean] only relevant to the `dep` style
+     # @return [String] an SVG string in the `dep` style, or an HTML string in the `ent` style
+     def displacy(style: "dep", compact: false)
+       PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
+     end
+
+     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+     def method_missing(name, *args)
+       @py_doc.send(name, *args)
+     end
+   end
+
+   # See also the spaCy Python API document for [`Language`](https://spacy.io/api/language).
+   class Language
+
+     # @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
+     attr_reader :spacy_nlp_id
+
+     # @return [Object] a Python `Language` instance accessible via `PyCall`
+     attr_reader :py_nlp
+
+     # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
+     # @param model [String] a language model installed in the system
+     def initialize(model = "en_core_web_sm")
+       @spacy_nlp_id = "nlp_#{model.object_id}"
+       PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
+       @py_nlp = PyCall.eval(@spacy_nlp_id)
+     end
+
+     # Reads and analyzes the given text.
+     # @param text [String] a text to be read and analyzed
+     def read(text)
+       Doc.new(py_nlp, text: text)
+     end
+
+     # Generates a matcher for the current language model.
+     # @return [Matcher]
+     def matcher
+       Matcher.new(@py_nlp)
+     end
+
+     # A utility method to look up a vocabulary item of the given id.
+     # @param id [Integer] a vocabulary id
+     # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
+     def vocab_string_lookup(id)
+       PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+     end
+
+     # A utility method to list pipeline components.
+     # @return [Array<String>] an array of text strings representing pipeline components
+     def pipe_names
+       pipe_array = []
+       PyCall::List.(@py_nlp.pipe_names).each do |pipe|
+         pipe_array << pipe
+       end
+       pipe_array
+     end
+
+     # A utility method to get a Python `Lexeme` object.
+     # @param text [String] a text string representing a lexeme
+     # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
+     def get_lexeme(text)
+       text = text.gsub("'", "\'")
+       @py_nlp.vocab[text]
+     end
+
+     # Returns _n_ lexemes whose vector representations are the most similar to a given vector representation of a word.
+     # @param vector [Object] a vector representation of a word (whether existing or non-existing)
+     # @param n [Integer] the number of lexemes to return
+     # @return [Array<Hash{:key => Integer, :text => String, :best_row => Array<Float>, :score => Float}>] an array of hash objects, each containing the `key`, `text`, `best_row` and similarity `score` of a lexeme
+     def most_similar(vector, n)
+       vec_array = Numpy.asarray([vector])
+       py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
+       key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist}]")
+       keys = key_texts.map{|kt| kt[0]}
+       texts = key_texts.map{|kt| kt[1]}
+       best_rows = PyCall::List.(py_result[1])[0]
+       scores = PyCall::List.(py_result[2])[0]
+
+       results = []
+       n.times do |i|
+         results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
+       end
+       results
+     end
+
+     # A utility method to batch-process many texts.
+     # @param texts [Array<String>]
+     # @param disable [Array<String>]
+     # @param batch_size [Integer]
+     # @return [Array<Doc>]
+     def pipe(texts, disable: [], batch_size: 50)
+       docs = []
+       PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
+         docs << Doc.new(@py_nlp, py_doc: py_doc)
+       end
+       docs
+     end
+
+     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+     def method_missing(name, *args)
+       @py_nlp.send(name, *args)
+     end
+   end
+
+   # See also the spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
+   class Matcher
+
+     # @return [Object] a Python `Matcher` instance accessible via `PyCall`
+     attr_reader :py_matcher
+
+     # Creates a {Matcher} instance.
+     # @param nlp [Object] a Python `Language` instance accessible via `PyCall`
+     def initialize(nlp)
+       @py_matcher = PyMatcher.(nlp.vocab)
+     end
+
+     # Adds a label string and a text pattern.
+     # @param text [String] a label string given to the pattern
+     # @param pattern [Array<Array<Hash>>] sequences of text patterns that are alternative to each other
+     def add(text, pattern)
+       @py_matcher.add(text, pattern)
+     end
+
+     # Executes the match.
+     # @param doc [Doc] a {Doc} instance
+     # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
+     def match(doc)
+       str_results = @py_matcher.(doc.py_doc).to_s
+       s = StringScanner.new(str_results[1..-2])
+       results = []
+       while s.scan_until(/(\d+), (\d+), (\d+)/)
+         next unless s.matched
+         triple = s.matched.split(", ")
+         match_id = triple[0].to_i
+         start_index = triple[1].to_i
+         end_index = triple[2].to_i - 1
+         results << {match_id: match_id, start_index: start_index, end_index: end_index}
+       end
+       results
+     end
+   end
+
    # See also the spaCy Python API document for [`Span`](https://spacy.io/api/span).
    class Span

-     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-     attr_reader :spacy_span_id
-
      # @return [Object] a Python `Span` instance accessible via `PyCall`
      attr_reader :py_span

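
The new `Language#most_similar` above returns plain Ruby hashes. A hypothetical call, assuming a model with word vectors such as `en_core_web_lg` (the vector arithmetic relies on PyCall forwarding `+` and `-` to the underlying numpy arrays):

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_lg")

    tokyo  = nlp.get_lexeme("Tokyo")
    japan  = nlp.get_lexeme("Japan")
    france = nlp.get_lexeme("France")

    # Tokyo - Japan + France should land near the capital of France
    query = tokyo.vector - japan.vector + france.vector
    nlp.most_similar(query, 5).each do |lexeme|
      puts "#{lexeme[:text]}: #{lexeme[:score]}"
    end
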
@@ -35,21 +348,18 @@ module Spacy
      alias_method :size, :count

      # It is recommended to use the {Doc#span} method to create a span. If you need to
-     # create one using {Span#initialize}, either of the two method signatures should be used: `Spacy.new(doc, py_span)` and `Spacy.new(doc, start_index, end_index, options)`.
+     # create one using {Span#initialize}, there are two method signatures:
+     # `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
      # @param doc [Doc] the document to which this span belongs
      # @param start_index [Integer] the index of the item starting the span inside a doc
      # @param end_index [Integer] the index of the item ending the span inside a doc
      # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
      def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
        @doc = doc
-       @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
        if py_span
          @py_span = py_span
        else
-         options = PyCall::Dict.(options)
-         PyCall.exec("#{@spacy_span_id}_opts = #{options}")
-         PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
-         @py_span = PyCall.eval(@spacy_span_id)
+         @py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
        end
      end

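
With the change above, a span can be built directly from a doc without any string-interpolated Python. A small sketch, assuming `Span#text` is delegated to the underlying Python span as elsewhere in the library:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_sm")
    doc = nlp.read("I like salty fries and hamburgers.")

    span = Spacy::Span.new(doc, start_index: 2, end_index: 3)
    puts span.text  # presumably "salty fries"
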
@@ -63,7 +373,7 @@ module Spacy
        results
      end

-     # Iterates over the elements in the span yielding a token instance.
+     # Iterates over the elements in the span yielding a token instance each time.
      def each
        PyCall::List.(@py_span).each do |py_token|
          yield Token.new(py_token)
@@ -97,7 +407,6 @@ module Spacy
      def ents
        ent_array = []
        PyCall::List.(@py_span.ents).each do |py_span|
-         # ent_array << ent
          ent_array << Spacy::Span.new(@doc, py_span: py_span)
        end
        ent_array
@@ -106,11 +415,11 @@ module Spacy
      # Returns a span that represents the sentence that the given span is part of.
      # @return [Span]
      def sent
-       py_span =@py_span.sent
+       py_span = @py_span.sent
        return Spacy::Span.new(@doc, py_span: py_span)
      end

-     # Returns a span if a range object is given, or a token if an integer representing the position of the doc is given.
+     # Returns a span if a range object is given, or a token if an integer representing a position in the doc is given.
      # @param range [Range] an ordinary Ruby Range object such as `0..3`, `1...4`, or `3 .. -1`
      def [](range)
        if range.is_a?(Range)
@@ -125,16 +434,16 @@ module Spacy
      # @param other [Span] the other span against which the similarity estimation is conducted
      # @return [Float]
      def similarity(other)
-       PyCall.eval("#{@spacy_span_id}.similarity(#{other.spacy_span_id})")
+       py_span.similarity(other.py_span)
      end

-     # Creates a document instance
+     # Creates a document instance from the span.
      # @return [Doc]
      def as_doc
-       Spacy::Doc.new(@doc.spacy_nlp_id, self.text)
+       Spacy::Doc.new(@doc.py_nlp, text: self.text)
      end

-     # Returns Tokens conjugated to the root of the span.
+     # Returns tokens conjugated to the root of the span.
      # @return [Array<Token>] an array of tokens
      def conjuncts
        conjunct_array = []
@@ -144,7 +453,7 @@ module Spacy
        conjunct_array
      end

-     # Returns Tokens that are to the left of the span, whose heads are within the span.
+     # Returns tokens that are to the left of the span, whose heads are within the span.
      # @return [Array<Token>] an array of tokens
      def lefts
        left_array = []
@@ -189,7 +498,8 @@ module Spacy
      # @return [String] a string representing the token
      attr_reader :text

-     # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens. There is no way to generate a token from scratch but relying on a pre-exising Python {Token} object.
+     # It is recommended to use the {Doc#tokens} or {Span#tokens} methods to create tokens.
+     # There is no way to generate a token from scratch other than relying on a pre-existing Python {Token} object.
      # @param py_token [Object] Python `Token` object
      def initialize(py_token)
        @py_token = py_token
@@ -253,7 +563,7 @@ module Spacy
      end

      # Returns a hash or string of morphological information
-     # @param dict [Boolean] if true, a hash will be returned instead of a string
+     # @param hash [Boolean] if true, a hash will be returned instead of a string
      # @return [Hash, String]
      def morphology(hash = true)
        if @py_token.has_morph
@@ -278,310 +588,6 @@ module Spacy
      end
    end

-   # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
-   class Doc
-
-     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-     attr_reader :spacy_nlp_id
-
-     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-     attr_reader :spacy_doc_id
-
-     # @return [Object] a Python `Doc` instance accessible via `PyCall`
-     attr_reader :py_doc
-
-     # @return [String] a text string of the document
-     attr_reader :text
-
-     include Enumerable
-
-     alias_method :length, :count
-     alias_method :len, :count
-     alias_method :size, :count
-
-     # Creates a new instance of {Doc}.
-     # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
-     # @param text [String] The text string to be analyzed
-     def initialize(nlp_id, text)
-       @text = text
-       @spacy_nlp_id = nlp_id
-       @spacy_doc_id = "doc_#{text.object_id}"
-       quoted = text.gsub('"', '\"')
-       PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
-       PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
-       @py_doc = PyCall.eval(@spacy_doc_id)
-     end
-
-
-     # Retokenizes the text merging a span into a single token.
-     # @param start_index [Integer] The start position of the span to be retokenized in the document
-     # @param end_index [Integer] The end position of the span to be retokenized in the document
-     # @param attributes [Hash] Attributes to set on the merged token
-     def retokenize(start_index, end_index, attributes = {})
-       py_attrs = PyCall::Dict.(attributes)
-       PyCall.exec(<<PY)
- with #{@spacy_doc_id}.retokenize() as retokenizer:
-     retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
- PY
-       @py_doc = PyCall.eval(@spacy_doc_id)
-     end
-
-     # Retokenizes the text splitting the specified token.
-     # @param pos_in_doc [Integer] The position of the span to be retokenized in the document
-     # @param split_array [Array<String>] text strings of the split results
-     # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
-     # @param attributes [Hash] The attributes of the split elements
-     def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
-       py_attrs = PyCall::Dict.(attributes)
-       py_split_array = PyCall::List.(split_array)
-       PyCall.exec(<<PY)
- with #{@spacy_doc_id}.retokenize() as retokenizer:
-     heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
-     attrs = #{py_attrs}
-     split_array = #{py_split_array}
-     retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
- PY
-       @py_doc = PyCall.eval(@spacy_doc_id)
-     end
-
-     # String representation of the token.
-     # @return [String]
-     def to_s
-       @text
-     end
-
-     # Returns an array of tokens contained in the doc.
-     # @return [Array<Token>]
-     def tokens
-       results = []
-       PyCall::List.(@py_doc).each do |py_token|
-         results << Token.new(py_token)
-       end
-       results
-     end
-
-     # Iterates over the elements in the doc yielding a token instance.
-     def each
-       PyCall::List.(@py_doc).each do |py_token|
-         yield Token.new(py_token)
-       end
-     end
-
-     # Returns a span of the specified range within the doc.
-     # The method should be used either of the two ways: `Doc#span(range)` or `Doc#span{start_pos, size_of_span}`.
-     # @param range_or_start [Range, Integer] A range object, or, alternatively, an integer that represents the start position of the span
-     # @param optional_size [Integer] An integer representing the size of the span
-     # @return [Span]
-     def span(range_or_start, optional_size = nil)
-       if optional_size
-         start_index = range_or_start
-         temp = tokens[start_index ... start_index + optional_size]
-       else
-         start_index = range_or_start.first
-         range = range_or_start
-         temp = tokens[range]
-       end
-
-       end_index = start_index + temp.size - 1
-
-       Span.new(self, start_index: start_index, end_index: end_index)
-     end
-
-     # Returns an array of spans representing noun chunks.
-     # @return [Array<Span>]
-     def noun_chunks
-       chunk_array = []
-       py_chunks = PyCall::List.(@py_doc.noun_chunks)
-       py_chunks.each do |py_chunk|
-         chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
-       end
-       chunk_array
-     end
-
-     # Returns an array of spans representing sentences.
-     # @return [Array<Span>]
-     def sents
-       sentence_array = []
-       py_sentences = PyCall::List.(@py_doc.sents)
-       py_sentences.each do |py_sent|
-         sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
-       end
-       sentence_array
-     end
-
-     # Returns an array of spans representing named entities.
-     # @return [Array<Span>]
-     def ents
-       # so that ents canbe "each"-ed in Ruby
-       ent_array = []
-       PyCall::List.(@py_doc.ents).each do |ent|
-         ent_array << ent
-       end
-       ent_array
-     end
-
-     # Returns a span if given a range object; returns a token if given an integer representing a position in the doc.
-     # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
-     def [](range)
-       if range.is_a?(Range)
-         py_span = @py_doc[range]
-         return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
-       else
-         return Token.new(@py_doc[range])
-       end
-     end
-
-     # Returns a semantic similarity estimate.
-     # @param other [Doc] the other doc to which a similarity estimation is made
-     # @return [Float]
-     def similarity(other)
-       PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
-     end
-
-     # Visualize the document in one of two styles: dep (dependencies) or ent (named entities).
-     # @param style [String] Either `dep` or `ent`
-     # @param compact [Boolean] Only relevant to the `dep` style
-     # @return [String] in the case of `dep`, the output text is an SVG while in the `ent` style, the output text is an HTML.
-     def displacy(style: "dep", compact: false)
-       PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
-     end
-
-     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
-     def method_missing(name, *args)
-       @py_doc.send(name, *args)
-     end
-   end
-
-   # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
-   class Matcher
-
-     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-     attr_reader :spacy_matcher_id
-
-     # @return [Object] a Python `Matcher` instance accessible via `PyCall`
-     attr_reader :py_matcher
-
-     # Creates a {Matcher} instance
-     # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
-     def initialize(nlp_id)
-       @spacy_matcher_id = "doc_#{nlp_id}_matcher"
-       PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
-       @py_matcher = PyCall.eval(@spacy_matcher_id)
-     end
-
-     # Adds a label string and a text pattern.
-     # @param text [String] a label string given to the pattern
-     # @param pattern [Array<Array<Hash>>] alternative sequences of text patterns
-     def add(text, pattern)
-       @py_matcher.add(text, pattern)
-     end
-
-     # Execute the match.
-     # @param doc [Doc] An {Doc} instance
-     # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] The id of the matched pattern, the starting position, and the end position
-     def match(doc)
-       str_results = PyCall.eval("#{@spacy_matcher_id}(#{doc.spacy_doc_id})").to_s
-       s = StringScanner.new(str_results[1..-2])
-       results = []
-       while s.scan_until(/(\d+), (\d+), (\d+)/)
-         next unless s.matched
-         triple = s.matched.split(", ")
-         match_id = triple[0].to_i
-         start_index = triple[1].to_i
-         end_index = triple[2].to_i - 1
-         results << {match_id: match_id, start_index: start_index, end_index: end_index}
-       end
-       results
-     end
-   end
-
-   # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
-   class Language
-
-     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-     attr_reader :spacy_nlp_id
-
-     # @return [Object] a Python `Language` instance accessible via `PyCall`
-     attr_reader :py_nlp
-
-     # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
-     # @param model [String] A language model installed in the system
-     def initialize(model = "en_core_web_sm")
-       @spacy_nlp_id = "nlp_#{model.object_id}"
-       PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
-       PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
-       @py_nlp = PyCall.eval(@spacy_nlp_id)
-     end
-
-     # Reads and analyze the given text.
-     # @param text [String] A text to be read and analyzed
-     def read(text)
-       Doc.new(@spacy_nlp_id, text)
-     end
-
-     # Generates a matcher for the current language model.
-     # @return [Matcher]
-     def matcher
-       Matcher.new(@spacy_nlp_id)
-     end
-
-     # A utility method to lookup a vocabulary item of the given id.
-     # @param id [Integer] A vocabulary id
-     # @return [Object] A Python `Lexeme` object
-     def vocab_string_lookup(id)
-       PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
-     end
-
-     # A utility method to list pipeline components.
-     # @return [Array<String>] An array of text strings representing pipeline components
-     def pipe_names
-       pipe_array = []
-       PyCall::List.(@py_nlp.pipe_names).each do |pipe|
-         pipe_array << pipe
-       end
-       pipe_array
-     end
-
-     # A utility method to get the tokenizer Python object.
-     # @return [Object] Python `Tokenizer` object
-     def tokenizer
-       return PyCall.eval("#{@spacy_nlp_id}.tokenizer")
-     end
-
-     # A utility method to get a Python `Lexeme` object.
-     # @param text [String] A text string representing a lexeme
-     # @return [Object] Python `Lexeme` object
-     def get_lexeme(text)
-       text = text.gsub("'", "\'")
-       py_lexeme = PyCall.eval("#{@spacy_nlp_id}.vocab['#{text}']")
-       return py_lexeme
-     end
-
-     # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
-     # @param vector [Object] A vector representation of a word (whether existing or non-existing)
-     # @return [Array<Hash{:key => Integer, :text => String, :best_row => Array<Float>, :score => Float}>] An array of hash objects each containing the `key`, `text`, `best_row` and similarity `score` of a lexeme
-     def most_similar(vector, n)
-       vec_array = Numpy.asarray([vector])
-       py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
-       key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
-       keys = key_texts.map{|kt| kt[0]}
-       texts = key_texts.map{|kt| kt[1]}
-       best_rows = PyCall::List.(py_result[1])[0]
-       scores = PyCall::List.(py_result[2])[0]
-
-       results = []
-       n.times do |i|
-         results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
-       end
-
-       results
-     end
-
-     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
-     def method_missing(name, *args)
-       @py_nlp.send(name, *args)
-     end
-   end

  end

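
The rewritten `Matcher` gets its results by parsing the stringified Python return value with `StringScanner`. A hypothetical usage sketch, reusing the pattern-hash style of the gem's bundled examples:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_sm")
    matcher = nlp.matcher
    matcher.add("HELLO_WORLD", [[{LOWER: "hello"}, {LOWER: "world"}]])

    doc = nlp.read("Hello World and hello world!")
    matcher.match(doc).each do |m|
      p m  # e.g. {match_id: ..., start_index: 0, end_index: 1}
    end
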
data/lib/ruby-spacy/version.rb CHANGED
@@ -2,5 +2,5 @@

  module Spacy
    # The version number of the module
-   VERSION = "0.1.2"
+   VERSION = "0.1.3"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: ruby-spacy
  version: !ruby/object:Gem::Version
-   version: 0.1.2
+   version: 0.1.3
  platform: ruby
  authors:
  - Yoichiro Hasebe
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2021-06-26 00:00:00.000000000 Z
+ date: 2021-06-28 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: pycall
@@ -66,6 +66,7 @@ extra_rdoc_files: []
  files:
  - ".gitignore"
  - ".yardopts"
+ - CHANGELOG.md
  - Gemfile
  - Gemfile.lock
  - LICENSE.txt
@@ -123,7 +124,6 @@ files:
  - examples/linguistic_features/sentence_segmentation.rb
  - examples/linguistic_features/similarity.rb
  - examples/linguistic_features/similarity_between_spans.rb
- - examples/linguistic_features/special_case_tokenization_rules.rb
  - examples/linguistic_features/tokenization.rb
  - examples/rule_based_matching/creating_spans_from_matches.rb
  - examples/rule_based_matching/matcher.rb
@@ -149,7 +149,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubygems_version: 3.2.11
+ rubygems_version: 3.2.3
  signing_key:
  specification_version: 4
  summary: A wrapper module for using spaCy natural language processing library from
examples/linguistic_features/special_case_tokenization_rules.rb DELETED
@@ -1,19 +0,0 @@
- require "ruby-spacy"
- require "terminal-table"
-
- nlp = Spacy::Language.new("en_core_web_sm")
-
- doc = nlp.read("gimme that")
-
- puts doc.tokens.join(" ")
-
- # Add special case rule
- special_case = [{ORTH: "gim"}, {ORTH: "me"}]
- tokenizer = nlp.tokenizer
- tokenizer.add_special_case("gimme", special_case)
-
- # Check new tokenization
- puts nlp.read("gimme that").tokens.join(" ")
-
- # gimme that
- # gim me that
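
The removed example above relied on the `Language#tokenizer` helper, which is also dropped from data/lib/ruby-spacy.rb in this release. Since `Language#method_missing` forwards unknown calls to the underlying Python object, the same special-case rule can presumably still be set up without the helper:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_sm")

    # method_missing forwards #tokenizer to the Python nlp object
    special_case = [{ORTH: "gim"}, {ORTH: "me"}]
    nlp.tokenizer.add_special_case("gimme", special_case)

    puts nlp.read("gimme that").tokens.join(" ")  # => gim me that
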