ruby-spacy 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9add9d3b065bbf5064652cb115f824221d929a20478d182782df5db564cc8f45
4
- data.tar.gz: f07d502f79883a452e7f250f0fe784425511a0de4f8a43db0b29ca03801bd755
3
+ metadata.gz: f2571b8bd9a0e170c5462d73b57ab75d535936f591c6b7954d0ca6b2a9d74aeb
4
+ data.tar.gz: 55bffce60937e8ae1f4135a076185b64f2e989c9431e67914d926ee60b2cc537
5
5
  SHA512:
6
- metadata.gz: 373c795a148034f4191cfaf130a23f464dc2b43927bf6aa3165999c78797365ce2f976021ea8b9ab1dd083736e5f9a1da51a5ccf0156d00ec39dac9fd19bde7c
7
- data.tar.gz: e370e503c23d15a0a44be84bf578775b0a4acc5557468c7fc9468cde44e0e084018be8dc17c3e7c21d9efdaf229611ca234614fcd2e811272051c7c2922b408d
6
+ metadata.gz: b3de6a6179eef74f224db186075fe084c93fb2fc802c4831b4b23c069594ac8c6aada546336d16065b9eeef116ba3a686afc6c441e68a66ba0fbddd7ffa321a6
7
+ data.tar.gz: 05b174052487979bf1c81a54469264068dac76d17d17b6f870b7666472625ca596f022abf6f6d734f5b02d4d623b11351c57e1d1d90deb192aaaa36d59e7255c
data/CHANGELOG.md ADDED
@@ -0,0 +1,14 @@
1
+ # Change Log
2
+
3
+ ## 0.1.3 - 2021-06-26
4
+ - Code cleanup
5
+
6
+ ## 0.1.2 - 2021-06-26
7
+ ### Added
8
+ - `Spacy::Token#morpheme` method
9
+
10
+ ## 0.1.1 - 2021-06-26
11
+ - Project description fixed
12
+
13
+ ## 0.1.0 - 2021-06-26
14
+ - Initial release
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ruby-spacy (0.1.2)
4
+ ruby-spacy (0.1.3)
5
5
  numpy (~> 0.4.0)
6
6
  pycall (~> 1.4.0)
7
7
  terminal-table (~> 3.0.1)
@@ -2,7 +2,7 @@ require "ruby-spacy"
2
2
  require "terminal-table"
3
3
 
4
4
  nlp = Spacy::Language.new("en_core_web_sm")
5
- doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
5
+ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion.")
6
6
 
7
7
  headings = ["text", "lemma", "pos", "tag", "dep"]
8
8
  rows = []
@@ -4,8 +4,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
4
4
  doc1 = nlp.read("I like salty fries and hamburgers.")
5
5
  doc2 = nlp.read("Fast food tastes very good.")
6
6
 
7
- puts "Doc 1: " + doc1
8
- puts "Doc 2: " + doc2
7
+ puts "Doc 1: " + doc1.text
8
+ puts "Doc 2: " + doc2.text
9
9
  puts "Similarity: #{doc1.similarity(doc2)}"
10
10
 
11
11
  # Doc 1: I like salty fries and hamburgers.
@@ -6,8 +6,8 @@ nlp = Spacy::Language.new("ja_core_news_sm")
6
6
  sentence = "自動運転車は保険責任を製造者に転嫁する。"
7
7
  doc = nlp.read(sentence)
8
8
 
9
- dep_svg = doc.displacy('dep', false)
9
+ dep_svg = doc.displacy(style: 'dep', compact: false)
10
10
 
11
- File.open(File.join(File.dirname(__FILE__), "outputs/test_dep.svg"), "w") do |file|
11
+ File.open(File.join(File.dirname(__FILE__), "test_dep.svg"), "w") do |file|
12
12
  file.write(dep_svg)
13
13
  end
@@ -7,7 +7,7 @@ sentence ="セバスチアン・スランが2007年にグーグルで自動運
7
7
 
8
8
  doc = nlp.read(sentence)
9
9
 
10
- ent_html = doc.displacy('ent')
10
+ ent_html = doc.displacy(style: 'ent')
11
11
 
12
12
  File.open(File.join(File.dirname(__FILE__), "outputs/test_ent.html"), "w") do |file|
13
13
  file.write(ent_html)
@@ -5,7 +5,7 @@ nlp = Spacy::Language.new("en_core_web_sm")
5
5
 
6
6
  doc = nlp.read("bright red apples on the tree")
7
7
 
8
- puts "Text: " + doc
8
+ puts "Text: " + doc.text
9
9
 
10
10
  puts "Words to the left of 'apple': " + Spacy.generator_to_array(doc[2].lefts).to_s
11
11
  puts "Words to the right of 'apple': " + Spacy.generator_to_array(doc[2].rights).to_s
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
5
5
  doc1 = nlp.read("I like salty fries and hamburgers.")
6
6
  doc2 = nlp.read("Fast food tastes very good.")
7
7
 
8
- puts "Doc 1: " + doc1
9
- puts "Doc 2: " + doc2
8
+ puts "Doc 1: " + doc1.text
9
+ puts "Doc 2: " + doc2.text
10
10
  puts "Similarity: #{doc1.similarity(doc2)}"
11
11
 
12
12
  # Doc 1: I like salty fries and hamburgers.
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
5
5
  doc1 = nlp.read("I like salty fries and hamburgers.")
6
6
  doc2 = nlp.read("Fast food tastes very good.")
7
7
 
8
- puts "Doc 1: " + doc1
9
- puts "Doc 2: " + doc2
8
+ puts "Doc 1: " + doc1.text
9
+ puts "Doc 2: " + doc2.text
10
10
  puts "Similarity: #{doc1.similarity(doc2)}"
11
11
 
12
12
  span1 = doc1.span(2, 2) # salty fries
data/lib/ruby-spacy.rb CHANGED
@@ -3,12 +3,34 @@
3
3
  require_relative "ruby-spacy/version"
4
4
  require 'enumerator'
5
5
  require 'strscan'
6
- require 'pycall/import'
7
6
  require 'numpy'
7
+ require 'pycall/import'
8
8
  include PyCall::Import
9
9
 
10
10
  # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
11
11
  module Spacy
12
+
13
+ extend PyCall::Import
14
+ spacy = PyCall.import_module('spacy')
15
+
16
+ # Python `Language` class
17
+ PyLanguage = spacy.language.Language
18
+
19
+ # Python `Doc` class object
20
+ PyDoc = spacy.tokens.Doc
21
+
22
+ # Python `Span` class object
23
+ PySpan = spacy.tokens.Span
24
+
25
+ # Python `Token` class object
26
+ PyToken = spacy.tokens.Token
27
+
28
+ # Python `Matcher` class object
29
+ PyMatcher = spacy.matcher.Matcher
30
+
31
+ # Python `displacy` object
32
+ PyDisplacy = spacy.displacy
33
+
12
34
  # A utility module method to convert Python's generator object to a Ruby array,
13
35
  # mainly used on the items inside the array returned from dependency-related methods
14
36
  # such as {Span#rights}, {Span#lefts} and {Span#subtree}.
@@ -16,12 +38,303 @@ module Spacy
16
38
  PyCall::List.(py_generator)
17
39
  end
18
40
 
41
+ # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
42
+ class Doc
43
+
44
+ # @return [Object] a Python `Language` instance accessible via `PyCall`
45
+ attr_reader :py_nlp
46
+
47
+ # @return [Object] a Python `Doc` instance accessible via `PyCall`
48
+ attr_reader :py_doc
49
+
50
+ # @return [String] a text string of the document
51
+ attr_reader :text
52
+
53
+ include Enumerable
54
+
55
+ alias_method :length, :count
56
+ alias_method :len, :count
57
+ alias_method :size, :count
58
+
59
+ # It is recommended to use {Language#read} method to create a doc. If you need to
60
+ # create one using {Doc#initialize}, there are two method signatures:
61
+ # `Spacy::Doc.new(nlp, py_doc: Object)` and `Spacy::Doc.new(nlp, text: String)`.
62
+ # @param nlp [Language] an instance of {Language} class
63
+ # @param py_doc [Object] an instance of Python `Doc` class
64
+ # @param text [String] the text string to be analyzed
65
+ def initialize(nlp, py_doc: nil, text: nil)
66
+ @py_nlp = nlp
67
+ if py_doc
68
+ @py_doc = py_doc
69
+ else
70
+ @py_doc = nlp.(text)
71
+ end
72
+ @text = @py_doc.text
73
+ end
74
+
75
+ # Retokenizes the text merging a span into a single token.
76
+ # @param start_index [Integer] the start position of the span to be retokenized in the document
77
+ # @param end_index [Integer] the end position of the span to be retokenized in the document
78
+ # @param attributes [Hash] attributes to set on the merged token
79
+ def retokenize(start_index, end_index, attributes = {})
80
+ PyCall.with(@py_doc.retokenize()) do |retokenizer|
81
+ retokenizer.merge(@py_doc[start_index .. end_index], attrs: attributes)
82
+ end
83
+ end
84
+
85
+ # Retokenizes the text splitting the specified token.
86
+ # @param pos_in_doc [Integer] the position of the span to be retokenized in the document
87
+ # @param split_array [Array<String>] text strings of the split results
88
+ # @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
89
+ # @param attributes [Hash] the attributes of the split elements
90
+ def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
91
+ PyCall.with(@py_doc.retokenize()) do |retokenizer|
92
+ heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
93
+ retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
94
+ end
95
+ end
96
+
97
+ # String representation of the document.
98
+ # @return [String]
99
+ def to_s
100
+ @text
101
+ end
102
+
103
+ # Returns an array of tokens contained in the doc.
104
+ # @return [Array<Token>]
105
+ def tokens
106
+ results = []
107
+ PyCall::List.(@py_doc).each do |py_token|
108
+ results << Token.new(py_token)
109
+ end
110
+ results
111
+ end
112
+
113
+ # Iterates over the elements in the doc yielding a token instance each time.
114
+ def each
115
+ PyCall::List.(@py_doc).each do |py_token|
116
+ yield Token.new(py_token)
117
+ end
118
+ end
119
+
120
+ # Returns a span of the specified range within the doc.
121
+ # The method should be used in either of two ways: `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
122
+ # @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
123
+ # @param optional_size [Integer] an integer representing the size of the span
124
+ # @return [Span]
125
+ def span(range_or_start, optional_size = nil)
126
+ if optional_size
127
+ start_index = range_or_start
128
+ temp = tokens[start_index ... start_index + optional_size]
129
+ else
130
+ start_index = range_or_start.first
131
+ range = range_or_start
132
+ temp = tokens[range]
133
+ end
134
+
135
+ end_index = start_index + temp.size - 1
136
+
137
+ Span.new(self, start_index: start_index, end_index: end_index)
138
+ end
139
+
140
+ # Returns an array of spans representing noun chunks.
141
+ # @return [Array<Span>]
142
+ def noun_chunks
143
+ chunk_array = []
144
+ py_chunks = PyCall::List.(@py_doc.noun_chunks)
145
+ py_chunks.each do |py_chunk|
146
+ chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
147
+ end
148
+ chunk_array
149
+ end
150
+
151
+ # Returns an array of spans each representing a sentence.
152
+ # @return [Array<Span>]
153
+ def sents
154
+ sentence_array = []
155
+ py_sentences = PyCall::List.(@py_doc.sents)
156
+ py_sentences.each do |py_sent|
157
+ sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
158
+ end
159
+ sentence_array
160
+ end
161
+
162
+ # Returns an array of spans each representing a named entity.
163
+ # @return [Array<Object>] an array of Python `Span` objects accessible via `PyCall` (not wrapped in {Span})
164
+ def ents
165
+ # so that ents can be "each"-ed in Ruby
166
+ ent_array = []
167
+ PyCall::List.(@py_doc.ents).each do |ent|
168
+ ent_array << ent
169
+ end
170
+ ent_array
171
+ end
172
+
173
+ # Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
174
+ # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
175
+ def [](range)
176
+ if range.is_a?(Range)
177
+ py_span = @py_doc[range]
178
+ return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
179
+ else
180
+ return Token.new(@py_doc[range])
181
+ end
182
+ end
183
+
184
+ # Returns a semantic similarity estimate.
185
+ # @param other [Doc] the other doc to which a similarity estimation is made
186
+ # @return [Float]
187
+ def similarity(other)
188
+ py_doc.similarity(other.py_doc)
189
+ end
190
+
191
+ # Visualize the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
192
+ # @param style [String] either `dep` or `ent`
193
+ # @param compact [Boolean] only relevant to the `dep' style
194
+ # @return [String] in the case of `dep`, the output text will be an SVG, whereas in the `ent` style, the output text will be an HTML.
195
+ def displacy(style: "dep", compact: false)
196
+ PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
197
+ end
198
+
199
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
200
+ def method_missing(name, *args)
201
+ @py_doc.send(name, *args)
202
+ end
203
+ end
204
+
205
+ # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
206
+ class Language
207
+
208
+ # @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
209
+ attr_reader :spacy_nlp_id
210
+
211
+ # @return [Object] a Python `Language` instance accessible via `PyCall`
212
+ attr_reader :py_nlp
213
+
214
+ # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
215
+ # @param model [String] A language model installed in the system
216
+ def initialize(model = "en_core_web_sm")
217
+ @spacy_nlp_id = "nlp_#{model.object_id}"
218
+ PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
219
+ @py_nlp = PyCall.eval(@spacy_nlp_id)
220
+ end
221
+
222
+ # Reads and analyzes the given text.
223
+ # @param text [String] a text to be read and analyzed
224
+ def read(text)
225
+ Doc.new(py_nlp, text: text)
226
+ end
227
+
228
+ # Generates a matcher for the current language model.
229
+ # @return [Matcher]
230
+ def matcher
231
+ Matcher.new(@py_nlp)
232
+ end
233
+
234
+ # A utility method to lookup a vocabulary item of the given id.
235
+ # @param id [Integer] a vocabulary id
236
+ # @return [String] the string stored under the given id in the vocabulary's string store (https://spacy.io/api/stringstore)
237
+ def vocab_string_lookup(id)
238
+ PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
239
+ end
240
+
241
+ # A utility method to list pipeline components.
242
+ # @return [Array<String>] An array of text strings representing pipeline components
243
+ def pipe_names
244
+ pipe_array = []
245
+ PyCall::List.(@py_nlp.pipe_names).each do |pipe|
246
+ pipe_array << pipe
247
+ end
248
+ pipe_array
249
+ end
250
+
251
+ # A utility method to get a Python `Lexeme` object.
252
+ # @param text [String] A text string representing a lexeme
253
+ # @return [Object] Python `Lexeme` object (https://spacy.io/api/lexeme)
254
+ def get_lexeme(text)
255
+ text = text.gsub("'", "\'")
256
+ @py_nlp.vocab[text]
257
+ end
258
+
259
+ # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
260
+ # @param vector [Object] A vector representation of a word (whether existing or non-existing)
261
+ # @return [Array<Hash{:key => Integer, :text => String, :best_row => Array<Float>, :score => Float}>] An array of hash objects, each containing the `key`, `text`, `best_row` and similarity `score` of a lexeme
262
+ def most_similar(vector, n)
263
+ vec_array = Numpy.asarray([vector])
264
+ py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
265
+ key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist}]")
266
+ keys = key_texts.map{|kt| kt[0]}
267
+ texts = key_texts.map{|kt| kt[1]}
268
+ best_rows = PyCall::List.(py_result[1])[0]
269
+ scores = PyCall::List.(py_result[2])[0]
270
+
271
+ results = []
272
+ n.times do |i|
273
+ results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
274
+ end
275
+ results
276
+ end
277
+
278
+ # Utility function to batch process many texts
279
+ # @param texts [Array<String>]
280
+ # @param disable [Array<String>]
281
+ # @param batch_size [Integer]
282
+ # @return [Array<Doc>]
283
+ def pipe(texts, disable: [], batch_size: 50)
284
+ docs = []
285
+ PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
286
+ docs << Doc.new(@py_nlp, py_doc: py_doc)
287
+ end
288
+ docs
289
+ end
290
+
291
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
292
+ def method_missing(name, *args)
293
+ @py_nlp.send(name, *args)
294
+ end
295
+ end
296
+
297
+ # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
298
+ class Matcher
299
+
300
+ # @return [Object] a Python `Matcher` instance accessible via `PyCall`
301
+ attr_reader :py_matcher
302
+
303
+ # Creates a {Matcher} instance
304
+ # @param nlp [Language] an instance of {Language} class
305
+ def initialize(nlp)
306
+ @py_matcher = PyMatcher.(nlp.vocab)
307
+ end
308
+
309
+ # Adds a label string and a text pattern.
310
+ # @param text [String] a label string given to the pattern
311
+ # @param pattern [Array<Array<Hash>>] sequences of text patterns that are alternative to each other
312
+ def add(text, pattern)
313
+ @py_matcher.add(text, pattern)
314
+ end
315
+
316
+ # Execute the match.
317
+ # @param doc [Doc] an {Doc} instance
318
+ # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
319
+ def match(doc)
320
+ str_results = @py_matcher.(doc.py_doc).to_s
321
+ s = StringScanner.new(str_results[1..-2])
322
+ results = []
323
+ while s.scan_until(/(\d+), (\d+), (\d+)/)
324
+ next unless s.matched
325
+ triple = s.matched.split(", ")
326
+ match_id = triple[0].to_i
327
+ start_index = triple[1].to_i
328
+ end_index = triple[2].to_i - 1
329
+ results << {match_id: match_id, start_index: start_index, end_index: end_index}
330
+ end
331
+ results
332
+ end
333
+ end
334
+
19
335
  # See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
20
336
  class Span
21
337
 
22
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
23
- attr_reader :spacy_span_id
24
-
25
338
  # @return [Object] a Python `Span` instance accessible via `PyCall`
26
339
  attr_reader :py_span
27
340
 
@@ -35,21 +348,18 @@ module Spacy
35
348
  alias_method :size, :count
36
349
 
37
350
  # It is recommended to use {Doc#span} method to create a span. If you need to
38
- # create one using {Span#initialize}, either of the two method signatures should be used: `Spacy.new(doc, py_span)` and `Spacy.new(doc, start_index, end_index, options)`.
351
+ # create one using {Span#initialize}, there are two method signatures:
352
+ # `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
39
353
  # @param doc [Doc] the document to which this span belongs to
40
354
  # @param start_index [Integer] the index of the item starting the span inside a doc
41
355
  # @param end_index [Integer] the index of the item ending the span inside a doc
42
356
  # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
43
357
  def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
44
358
  @doc = doc
45
- @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
46
359
  if py_span
47
360
  @py_span = py_span
48
361
  else
49
- options = PyCall::Dict.(options)
50
- PyCall.exec("#{@spacy_span_id}_opts = #{options}")
51
- PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
52
- @py_span = PyCall.eval(@spacy_span_id)
362
+ @py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
53
363
  end
54
364
  end
55
365
 
@@ -63,7 +373,7 @@ module Spacy
63
373
  results
64
374
  end
65
375
 
66
- # Iterates over the elements in the span yielding a token instance.
376
+ # Iterates over the elements in the span yielding a token instance each time.
67
377
  def each
68
378
  PyCall::List.(@py_span).each do |py_token|
69
379
  yield Token.new(py_token)
@@ -97,7 +407,6 @@ module Spacy
97
407
  def ents
98
408
  ent_array = []
99
409
  PyCall::List.(@py_span.ents).each do |py_span|
100
- # ent_array << ent
101
410
  ent_array << Spacy::Span.new(@doc, py_span: py_span)
102
411
  end
103
412
  ent_array
@@ -106,11 +415,11 @@ module Spacy
106
415
  # Returns a span that represents the sentence that the given span is part of.
107
416
  # @return [Span]
108
417
  def sent
109
- py_span =@py_span.sent
418
+ py_span = @py_span.sent
110
419
  return Spacy::Span.new(@doc, py_span: py_span)
111
420
  end
112
421
 
113
- # Returns a span if a range object is given, or a token if an integer representing the position of the doc is given.
422
+ # Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
114
423
  # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
115
424
  def [](range)
116
425
  if range.is_a?(Range)
@@ -125,16 +434,16 @@ module Spacy
125
434
  # @param other [Span] the other span to which a similarity estimation is conducted
126
435
  # @return [Float]
127
436
  def similarity(other)
128
- PyCall.eval("#{@spacy_span_id}.similarity(#{other.spacy_span_id})")
437
+ py_span.similarity(other.py_span)
129
438
  end
130
439
 
131
- # Creates a document instance
440
+ # Creates a document instance from the span
132
441
  # @return [Doc]
133
442
  def as_doc
134
- Spacy::Doc.new(@doc.spacy_nlp_id, self.text)
443
+ Spacy::Doc.new(@doc.py_nlp, text: self.text)
135
444
  end
136
445
 
137
- # Returns Tokens conjugated to the root of the span.
446
+ # Returns tokens conjugated to the root of the span.
138
447
  # @return [Array<Token>] an array of tokens
139
448
  def conjuncts
140
449
  conjunct_array = []
@@ -144,7 +453,7 @@ module Spacy
144
453
  conjunct_array
145
454
  end
146
455
 
147
- # Returns Tokens that are to the left of the span, whose heads are within the span.
456
+ # Returns tokens that are to the left of the span, whose heads are within the span.
148
457
  # @return [Array<Token>] an array of tokens
149
458
  def lefts
150
459
  left_array = []
@@ -189,7 +498,8 @@ module Spacy
189
498
  # @return [String] a string representing the token
190
499
  attr_reader :text
191
500
 
192
- # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens. There is no way to generate a token from scratch but relying on a pre-exising Python {Token} object.
501
+ # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
502
+ # There is no way to generate a token from scratch other than relying on a pre-existing Python {Token} object.
193
503
  # @param py_token [Object] Python `Token` object
194
504
  def initialize(py_token)
195
505
  @py_token = py_token
@@ -253,7 +563,7 @@ module Spacy
253
563
  end
254
564
 
255
565
  # Returns a hash or string of morphological information
256
- # @param dict [Boolean] if true, a hash will be returned instead of a string
566
+ # @param hash [Boolean] if true, a hash will be returned instead of a string
257
567
  # @return [Hash, String]
258
568
  def morphology(hash = true)
259
569
  if @py_token.has_morph
@@ -278,310 +588,6 @@ module Spacy
278
588
  end
279
589
  end
280
590
 
281
- # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
282
- class Doc
283
-
284
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
285
- attr_reader :spacy_nlp_id
286
-
287
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
288
- attr_reader :spacy_doc_id
289
-
290
- # @return [Object] a Python `Doc` instance accessible via `PyCall`
291
- attr_reader :py_doc
292
-
293
- # @return [String] a text string of the document
294
- attr_reader :text
295
-
296
- include Enumerable
297
-
298
- alias_method :length, :count
299
- alias_method :len, :count
300
- alias_method :size, :count
301
-
302
- # Creates a new instance of {Doc}.
303
- # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
304
- # @param text [String] The text string to be analyzed
305
- def initialize(nlp_id, text)
306
- @text = text
307
- @spacy_nlp_id = nlp_id
308
- @spacy_doc_id = "doc_#{text.object_id}"
309
- quoted = text.gsub('"', '\"')
310
- PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
311
- PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
312
- @py_doc = PyCall.eval(@spacy_doc_id)
313
- end
314
-
315
-
316
- # Retokenizes the text merging a span into a single token.
317
- # @param start_index [Integer] The start position of the span to be retokenized in the document
318
- # @param end_index [Integer] The end position of the span to be retokenized in the document
319
- # @param attributes [Hash] Attributes to set on the merged token
320
- def retokenize(start_index, end_index, attributes = {})
321
- py_attrs = PyCall::Dict.(attributes)
322
- PyCall.exec(<<PY)
323
- with #{@spacy_doc_id}.retokenize() as retokenizer:
324
- retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
325
- PY
326
- @py_doc = PyCall.eval(@spacy_doc_id)
327
- end
328
-
329
- # Retokenizes the text splitting the specified token.
330
- # @param pos_in_doc [Integer] The position of the span to be retokenized in the document
331
- # @param split_array [Array<String>] text strings of the split results
332
- # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
333
- # @param attributes [Hash] The attributes of the split elements
334
- def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
335
- py_attrs = PyCall::Dict.(attributes)
336
- py_split_array = PyCall::List.(split_array)
337
- PyCall.exec(<<PY)
338
- with #{@spacy_doc_id}.retokenize() as retokenizer:
339
- heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
340
- attrs = #{py_attrs}
341
- split_array = #{py_split_array}
342
- retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
343
- PY
344
- @py_doc = PyCall.eval(@spacy_doc_id)
345
- end
346
-
347
- # String representation of the token.
348
- # @return [String]
349
- def to_s
350
- @text
351
- end
352
-
353
- # Returns an array of tokens contained in the doc.
354
- # @return [Array<Token>]
355
- def tokens
356
- results = []
357
- PyCall::List.(@py_doc).each do |py_token|
358
- results << Token.new(py_token)
359
- end
360
- results
361
- end
362
-
363
- # Iterates over the elements in the doc yielding a token instance.
364
- def each
365
- PyCall::List.(@py_doc).each do |py_token|
366
- yield Token.new(py_token)
367
- end
368
- end
369
-
370
- # Returns a span of the specified range within the doc.
371
- # The method should be used either of the two ways: `Doc#span(range)` or `Doc#span{start_pos, size_of_span}`.
372
- # @param range_or_start [Range, Integer] A range object, or, alternatively, an integer that represents the start position of the span
373
- # @param optional_size [Integer] An integer representing the size of the span
374
- # @return [Span]
375
- def span(range_or_start, optional_size = nil)
376
- if optional_size
377
- start_index = range_or_start
378
- temp = tokens[start_index ... start_index + optional_size]
379
- else
380
- start_index = range_or_start.first
381
- range = range_or_start
382
- temp = tokens[range]
383
- end
384
-
385
- end_index = start_index + temp.size - 1
386
-
387
- Span.new(self, start_index: start_index, end_index: end_index)
388
- end
389
-
390
- # Returns an array of spans representing noun chunks.
391
- # @return [Array<Span>]
392
- def noun_chunks
393
- chunk_array = []
394
- py_chunks = PyCall::List.(@py_doc.noun_chunks)
395
- py_chunks.each do |py_chunk|
396
- chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
397
- end
398
- chunk_array
399
- end
400
-
401
- # Returns an array of spans representing sentences.
402
- # @return [Array<Span>]
403
- def sents
404
- sentence_array = []
405
- py_sentences = PyCall::List.(@py_doc.sents)
406
- py_sentences.each do |py_sent|
407
- sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
408
- end
409
- sentence_array
410
- end
411
-
412
- # Returns an array of spans representing named entities.
413
- # @return [Array<Span>]
414
- def ents
415
- # so that ents canbe "each"-ed in Ruby
416
- ent_array = []
417
- PyCall::List.(@py_doc.ents).each do |ent|
418
- ent_array << ent
419
- end
420
- ent_array
421
- end
422
-
423
- # Returns a span if given a range object; returns a token if given an integer representing a position in the doc.
424
- # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
425
- def [](range)
426
- if range.is_a?(Range)
427
- py_span = @py_doc[range]
428
- return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
429
- else
430
- return Token.new(@py_doc[range])
431
- end
432
- end
433
-
434
- # Returns a semantic similarity estimate.
435
- # @param other [Doc] the other doc to which a similarity estimation is made
436
- # @return [Float]
437
- def similarity(other)
438
- PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
439
- end
440
-
441
- # Visualize the document in one of two styles: dep (dependencies) or ent (named entities).
442
- # @param style [String] Either `dep` or `ent`
443
- # @param compact [Boolean] Only relevant to the `dep' style
444
- # @return [String] in the case of `dep`, the output text is an SVG while in the `ent` style, the output text is an HTML.
445
- def displacy(style: "dep", compact: false)
446
- PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
447
- end
448
-
449
- # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
450
- def method_missing(name, *args)
451
- @py_doc.send(name, *args)
452
- end
453
- end
454
-
455
- # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
456
- class Matcher
457
-
458
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
459
- attr_reader :spacy_matcher_id
460
-
461
- # @return [Object] a Python `Matcher` instance accessible via `PyCall`
462
- attr_reader :py_matcher
463
-
464
- # Creates a {Matcher} instance
465
- # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
466
- def initialize(nlp_id)
467
- @spacy_matcher_id = "doc_#{nlp_id}_matcher"
468
- PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
469
- @py_matcher = PyCall.eval(@spacy_matcher_id)
470
- end
471
-
472
- # Adds a label string and a text pattern.
473
- # @param text [String] a label string given to the pattern
474
- # @param pattern [Array<Array<Hash>>] alternative sequences of text patterns
475
- def add(text, pattern)
476
- @py_matcher.add(text, pattern)
477
- end
478
-
479
- # Execute the match.
480
- # @param doc [Doc] An {Doc} instance
481
- # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] The id of the matched pattern, the starting position, and the end position
482
- def match(doc)
483
- str_results = PyCall.eval("#{@spacy_matcher_id}(#{doc.spacy_doc_id})").to_s
484
- s = StringScanner.new(str_results[1..-2])
485
- results = []
486
- while s.scan_until(/(\d+), (\d+), (\d+)/)
487
- next unless s.matched
488
- triple = s.matched.split(", ")
489
- match_id = triple[0].to_i
490
- start_index = triple[1].to_i
491
- end_index = triple[2].to_i - 1
492
- results << {match_id: match_id, start_index: start_index, end_index: end_index}
493
- end
494
- results
495
- end
496
- end
497
-
498
- # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
499
- class Language
500
-
501
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
502
- attr_reader :spacy_nlp_id
503
-
504
- # @return [Object] a Python `Language` instance accessible via `PyCall`
505
- attr_reader :py_nlp
506
-
507
- # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
508
- # @param model [String] A language model installed in the system
509
- def initialize(model = "en_core_web_sm")
510
- @spacy_nlp_id = "nlp_#{model.object_id}"
511
- PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
512
- PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
513
- @py_nlp = PyCall.eval(@spacy_nlp_id)
514
- end
515
-
516
- # Reads and analyze the given text.
517
- # @param text [String] A text to be read and analyzed
518
- def read(text)
519
- Doc.new(@spacy_nlp_id, text)
520
- end
521
-
522
- # Generates a matcher for the current language model.
523
- # @return [Matcher]
524
- def matcher
525
- Matcher.new(@spacy_nlp_id)
526
- end
527
-
528
- # A utility method to lookup a vocabulary item of the given id.
529
- # @param id [Integer] A vocabulary id
530
- # @return [Object] A Python `Lexeme` object
531
- def vocab_string_lookup(id)
532
- PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
533
- end
534
-
535
- # A utility method to list pipeline components.
536
- # @return [Array<String>] An array of text strings representing pipeline components
537
- def pipe_names
538
- pipe_array = []
539
- PyCall::List.(@py_nlp.pipe_names).each do |pipe|
540
- pipe_array << pipe
541
- end
542
- pipe_array
543
- end
544
-
545
- # A utility method to get the tokenizer Python object.
546
- # @return [Object] Python `Tokenizer` object
547
- def tokenizer
548
- return PyCall.eval("#{@spacy_nlp_id}.tokenizer")
549
- end
550
-
551
- # A utility method to get a Python `Lexeme` object.
552
- # @param text [String] A text string representing a lexeme
553
- # @return [Object] Python `Tokenizer` object
554
- def get_lexeme(text)
555
- text = text.gsub("'", "\'")
556
- py_lexeme = PyCall.eval("#{@spacy_nlp_id}.vocab['#{text}']")
557
- return py_lexeme
558
- end
559
-
560
- # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
561
- # @param vector [Object] A vector representation of a word (whether existing or non-existing)
562
- # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
563
- def most_similar(vector, n)
564
- vec_array = Numpy.asarray([vector])
565
- py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
566
- key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
567
- keys = key_texts.map{|kt| kt[0]}
568
- texts = key_texts.map{|kt| kt[1]}
569
- best_rows = PyCall::List.(py_result[1])[0]
570
- scores = PyCall::List.(py_result[2])[0]
571
-
572
- results = []
573
- n.times do |i|
574
- results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
575
- end
576
-
577
- results
578
- end
579
-
580
- # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism....
581
- def method_missing(name, *args)
582
- @py_nlp.send(name, *args)
583
- end
584
- end
585
591
 
586
592
  end
587
593
 
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Spacy
4
4
  # The version number of the module
5
- VERSION = "0.1.2"
5
+ VERSION = "0.1.3"
6
6
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-spacy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-06-26 00:00:00.000000000 Z
11
+ date: 2021-06-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pycall
@@ -66,6 +66,7 @@ extra_rdoc_files: []
66
66
  files:
67
67
  - ".gitignore"
68
68
  - ".yardopts"
69
+ - CHANGELOG.md
69
70
  - Gemfile
70
71
  - Gemfile.lock
71
72
  - LICENSE.txt
@@ -123,7 +124,6 @@ files:
123
124
  - examples/linguistic_features/sentence_segmentation.rb
124
125
  - examples/linguistic_features/similarity.rb
125
126
  - examples/linguistic_features/similarity_between_spans.rb
126
- - examples/linguistic_features/special_case_tokenization_rules.rb
127
127
  - examples/linguistic_features/tokenization.rb
128
128
  - examples/rule_based_matching/creating_spans_from_matches.rb
129
129
  - examples/rule_based_matching/matcher.rb
@@ -149,7 +149,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
149
149
  - !ruby/object:Gem::Version
150
150
  version: '0'
151
151
  requirements: []
152
- rubygems_version: 3.2.11
152
+ rubygems_version: 3.2.3
153
153
  signing_key:
154
154
  specification_version: 4
155
155
  summary: A wrapper module for using spaCy natural language processing library from
@@ -1,19 +0,0 @@
1
- require "ruby-spacy"
2
- require "terminal-table"
3
-
4
- nlp = Spacy::Language.new("en_core_web_sm")
5
-
6
- doc = nlp.read("gimme that")
7
-
8
- puts doc.tokens.join(" ")
9
-
10
- # Add special case rule
11
- special_case = [{ORTH: "gim"}, {ORTH: "me"}]
12
- tokenizer = nlp.tokenizer
13
- tokenizer.add_special_case("gimme", special_case)
14
-
15
- # Check new tokenization
16
- puts nlp.read("gimme that").tokens.join(" ")
17
-
18
- # gimme that
19
- # gim me that