ruby-spacy 0.1.0 → 0.1.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -0
- data/Gemfile.lock +3 -1
- data/README.md +123 -77
- data/examples/get_started/lexeme.rb +2 -2
- data/examples/get_started/linguistic_annotations.rb +1 -1
- data/examples/get_started/morphology.rb +45 -0
- data/examples/get_started/most_similar.rb +28 -27
- data/examples/get_started/named_entities.rb +1 -1
- data/examples/get_started/pos_tags_and_dependencies.rb +18 -18
- data/examples/get_started/similarity.rb +2 -2
- data/examples/japanese/ancestors.rb +9 -11
- data/examples/japanese/entity_annotations_and_labels.rb +1 -1
- data/examples/japanese/lemmatization.rb +1 -1
- data/examples/japanese/most_similar.rb +28 -27
- data/examples/japanese/named_entity_recognition.rb +1 -1
- data/examples/japanese/navigating_parse_tree.rb +18 -18
- data/examples/japanese/noun_chunks.rb +1 -1
- data/examples/japanese/pos_tagging.rb +20 -20
- data/examples/japanese/visualizing_dependencies.rb +2 -2
- data/examples/japanese/visualizing_named_entities.rb +1 -1
- data/examples/linguistic_features/ancestors.rb +13 -10
- data/examples/linguistic_features/entity_annotations_and_labels.rb +1 -1
- data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +2 -2
- data/examples/linguistic_features/information_extraction.rb +2 -2
- data/examples/linguistic_features/iterating_children.rb +2 -2
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +5 -5
- data/examples/linguistic_features/lemmatization.rb +1 -1
- data/examples/linguistic_features/named_entity_recognition.rb +1 -1
- data/examples/linguistic_features/navigating_parse_tree.rb +12 -12
- data/examples/linguistic_features/noun_chunks.rb +1 -1
- data/examples/linguistic_features/pos_tagging.rb +1 -1
- data/examples/linguistic_features/retokenize_1.rb +1 -1
- data/examples/linguistic_features/retokenize_2.rb +2 -2
- data/examples/linguistic_features/rule_based_morphology.rb +1 -1
- data/examples/linguistic_features/similarity.rb +2 -2
- data/examples/linguistic_features/similarity_between_lexemes.rb +18 -0
- data/examples/linguistic_features/similarity_between_spans.rb +2 -2
- data/examples/rule_based_matching/creating_spans_from_matches.rb +1 -1
- data/lib/ruby-spacy.rb +493 -300
- data/lib/ruby-spacy/version.rb +1 -1
- data/ruby-spacy.gemspec +1 -1
- metadata +6 -5
- data/examples/linguistic_features/morphology.rb +0 -17
- data/examples/linguistic_features/special_case_tokenization_rules.rb +0 -19
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
|
|
10
10
|
rows = []
|
11
11
|
|
12
12
|
doc.ents.each do |ent|
|
13
|
-
rows << [ent.text, ent.start_char, ent.end_char, ent.
|
13
|
+
rows << [ent.text, ent.start_char, ent.end_char, ent.label]
|
14
14
|
end
|
15
15
|
|
16
16
|
table = Terminal::Table.new rows: rows, headings: headings
|
@@ -12,21 +12,21 @@ headings = ["text", "dep", "head text", "head pos", "children"]
|
|
12
12
|
rows = []
|
13
13
|
|
14
14
|
doc.each do |token|
|
15
|
-
rows << [token.text, token.
|
15
|
+
rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
|
16
16
|
end
|
17
17
|
|
18
18
|
table = Terminal::Table.new rows: rows, headings: headings
|
19
19
|
puts table
|
20
20
|
|
21
21
|
# Lemmatizer mode: rule
|
22
|
-
#
|
23
|
-
# | text | dep | head text | head pos | children
|
24
|
-
#
|
25
|
-
# | Autonomous | amod | cars | NOUN |
|
26
|
-
# | cars | nsubj | shift | VERB |
|
27
|
-
# | shift | ROOT | shift | VERB |
|
28
|
-
# | insurance | compound | liability | NOUN |
|
29
|
-
# | liability | dobj | shift | VERB |
|
30
|
-
# | toward | prep | shift | VERB |
|
31
|
-
# | manufacturers | pobj | toward | ADP |
|
32
|
-
#
|
22
|
+
# +---------------+----------+-----------+----------+-------------------------+
|
23
|
+
# | text | dep | head text | head pos | children |
|
24
|
+
# +---------------+----------+-----------+----------+-------------------------+
|
25
|
+
# | Autonomous | amod | cars | NOUN | |
|
26
|
+
# | cars | nsubj | shift | VERB | Autonomous |
|
27
|
+
# | shift | ROOT | shift | VERB | cars, liability, toward |
|
28
|
+
# | insurance | compound | liability | NOUN | |
|
29
|
+
# | liability | dobj | shift | VERB | insurance |
|
30
|
+
# | toward | prep | shift | VERB | manufacturers |
|
31
|
+
# | manufacturers | pobj | toward | ADP | |
|
32
|
+
# +---------------+----------+-----------+----------+-------------------------+
|
@@ -12,7 +12,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
|
|
12
12
|
rows = []
|
13
13
|
|
14
14
|
doc.noun_chunks.each do |chunk|
|
15
|
-
rows << [chunk.text, chunk.root.text, chunk.root.
|
15
|
+
rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
|
16
16
|
end
|
17
17
|
|
18
18
|
table = Terminal::Table.new rows: rows, headings: headings
|
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"
|
|
8
8
|
rows = []
|
9
9
|
|
10
10
|
doc.each do |token|
|
11
|
-
rows << [token.text, token.
|
11
|
+
rows << [token.text, token.lemma, token.pos, token.tag, token.dep, token.shape, token.is_alpha, token.is_stop]
|
12
12
|
end
|
13
13
|
|
14
14
|
table = Terminal::Table.new rows: rows, headings: headings
|
@@ -12,7 +12,7 @@ rows = []
|
|
12
12
|
doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)
|
13
13
|
|
14
14
|
doc.each do |token|
|
15
|
-
rows << [token.text, token.
|
15
|
+
rows << [token.text, token.pos, token.dep, token.head.text]
|
16
16
|
end
|
17
17
|
|
18
18
|
table = Terminal::Table.new rows: rows, headings: headings
|
@@ -6,11 +6,11 @@ nlp = Spacy::Language.new("en_core_web_sm")
|
|
6
6
|
sentence = "I live in New York"
|
7
7
|
doc = nlp.read(sentence)
|
8
8
|
|
9
|
-
puts "Before: " + doc.tokens.
|
9
|
+
puts "Before: " + doc.tokens.map(&:text).join(", ")
|
10
10
|
|
11
11
|
doc.retokenize(3, 4)
|
12
12
|
|
13
|
-
puts "After: " + doc.tokens.
|
13
|
+
puts "After: " + doc.tokens.map(&:text).join(", ")
|
14
14
|
|
15
15
|
# Before: I, live, in, New, York
|
16
16
|
# After: I, live, in, New York
|
@@ -6,7 +6,7 @@ nlp = Spacy::Language.new("en_core_web_sm")
|
|
6
6
|
doc = nlp.read("Where are you?")
|
7
7
|
|
8
8
|
puts "Morph features of the third word: " + doc[2].morph.to_s
|
9
|
-
puts "POS of the third word: " + doc[2].
|
9
|
+
puts "POS of the third word: " + doc[2].pos
|
10
10
|
|
11
11
|
# Morph features of the third word: Case=Nom|Person=2|PronType=Prs
|
12
12
|
# POS of the third word: PRON
|
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
|
|
5
5
|
doc1 = nlp.read("I like salty fries and hamburgers.")
|
6
6
|
doc2 = nlp.read("Fast food tastes very good.")
|
7
7
|
|
8
|
-
puts "Doc 1: " + doc1
|
9
|
-
puts "Doc 2: " + doc2
|
8
|
+
puts "Doc 1: " + doc1.text
|
9
|
+
puts "Doc 2: " + doc2.text
|
10
10
|
puts "Similarity: #{doc1.similarity(doc2)}"
|
11
11
|
|
12
12
|
# Doc 1: I like salty fries and hamburgers.
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require "ruby-spacy"
|
2
|
+
require "terminal-table"
|
3
|
+
|
4
|
+
nlp = Spacy::Language.new("en_core_web_lg")
|
5
|
+
|
6
|
+
orange = nlp.vocab("orange")
|
7
|
+
lemon = nlp.vocab("lemon")
|
8
|
+
|
9
|
+
book = nlp.vocab("book")
|
10
|
+
magazine = nlp.vocab("magazine")
|
11
|
+
|
12
|
+
puts "orange <=> lemon: #{orange.similarity(lemon)}"
|
13
|
+
puts "book <=> magazine: #{book.similarity(magazine)}"
|
14
|
+
puts "orange <=> book: #{orange.similarity(book)}"
|
15
|
+
|
16
|
+
# orange <=> lemon: 0.7080526351928711
|
17
|
+
# book <=> magazine: 0.4355940818786621
|
18
|
+
# orange <=> book: 0.12197211384773254
|
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
|
|
5
5
|
doc1 = nlp.read("I like salty fries and hamburgers.")
|
6
6
|
doc2 = nlp.read("Fast food tastes very good.")
|
7
7
|
|
8
|
-
puts "Doc 1: " + doc1
|
9
|
-
puts "Doc 2: " + doc2
|
8
|
+
puts "Doc 1: " + doc1.text
|
9
|
+
puts "Doc 2: " + doc2.text
|
10
10
|
puts "Similarity: #{doc1.similarity(doc2)}"
|
11
11
|
|
12
12
|
span1 = doc1.span(2, 2) # salty fries
|
@@ -10,7 +10,7 @@ matches = matcher.match(doc)
|
|
10
10
|
|
11
11
|
matches.each do |match|
|
12
12
|
span = Spacy::Span.new(doc, start_index: match[:start_index], end_index: match[:end_index], options: {label: match[:match_id]})
|
13
|
-
puts span.text + " / " + span.
|
13
|
+
puts span.text + " / " + span.label
|
14
14
|
end
|
15
15
|
|
16
16
|
# Barack Obama / US_PRESIDENT
|
data/lib/ruby-spacy.rb
CHANGED
@@ -3,12 +3,34 @@
|
|
3
3
|
require_relative "ruby-spacy/version"
|
4
4
|
require 'enumerator'
|
5
5
|
require 'strscan'
|
6
|
-
require 'pycall/import'
|
7
6
|
require 'numpy'
|
7
|
+
require 'pycall/import'
|
8
8
|
include PyCall::Import
|
9
9
|
|
10
10
|
# This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
|
11
11
|
module Spacy
|
12
|
+
|
13
|
+
extend PyCall::Import
|
14
|
+
spacy = PyCall.import_module('spacy')
|
15
|
+
|
16
|
+
# Python `Language` class
|
17
|
+
PyLanguage = spacy.language.Language
|
18
|
+
|
19
|
+
# Python `Doc` class object
|
20
|
+
PyDoc = spacy.tokens.Doc
|
21
|
+
|
22
|
+
# Python `Span` class object
|
23
|
+
PySpan = spacy.tokens.Span
|
24
|
+
|
25
|
+
# Python `Token` class object
|
26
|
+
PyToken = spacy.tokens.Token
|
27
|
+
|
28
|
+
# Python `Matcher` class object
|
29
|
+
PyMatcher = spacy.matcher.Matcher
|
30
|
+
|
31
|
+
# Python `displacy` object
|
32
|
+
PyDisplacy = spacy.displacy
|
33
|
+
|
12
34
|
# A utility module method to convert Python's generator object to a Ruby array,
|
13
35
|
# mainly used on the items inside the array returned from dependency-related methods
|
14
36
|
# such as {Span#rights}, {Span#lefts} and {Span#subtree}.
|
@@ -16,12 +38,320 @@ module Spacy
|
|
16
38
|
PyCall::List.(py_generator)
|
17
39
|
end
|
18
40
|
|
41
|
+
# See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
|
42
|
+
class Doc
|
43
|
+
|
44
|
+
# @return [Object] a Python `Language` instance accessible via `PyCall`
|
45
|
+
attr_reader :py_nlp
|
46
|
+
|
47
|
+
# @return [Object] a Python `Doc` instance accessible via `PyCall`
|
48
|
+
attr_reader :py_doc
|
49
|
+
|
50
|
+
# @return [String] a text string of the document
|
51
|
+
attr_reader :text
|
52
|
+
|
53
|
+
include Enumerable
|
54
|
+
|
55
|
+
alias_method :length, :count
|
56
|
+
alias_method :len, :count
|
57
|
+
alias_method :size, :count
|
58
|
+
|
59
|
+
# It is recommended to use {Language#read} method to create a doc. If you need to
|
60
|
+
# create one using {Doc#initialize}, there are two method signatures:
|
61
|
+
# `Spacy::Doc.new(nlp_id, py_doc: Object)` and `Spacy::Doc.new(nlp_id, text: String)`.
|
62
|
+
# @param nlp [Language] an instance of {Language} class
|
63
|
+
# @param py_doc [Object] an instance of Python `Doc` class
|
64
|
+
# @param text [String] the text string to be analyzed
|
65
|
+
def initialize(nlp, py_doc: nil, text: nil)
|
66
|
+
@py_nlp = nlp
|
67
|
+
if py_doc
|
68
|
+
@py_doc = py_doc
|
69
|
+
else
|
70
|
+
@py_doc = nlp.(text)
|
71
|
+
end
|
72
|
+
@text = @py_doc.text
|
73
|
+
end
|
74
|
+
|
75
|
+
# Retokenizes the text merging a span into a single token.
|
76
|
+
# @param start_index [Integer] the start position of the span to be retokenized in the document
|
77
|
+
# @param end_index [Integer] the end position of the span to be retokenized in the document
|
78
|
+
# @param attributes [Hash] attributes to set on the merged token
|
79
|
+
def retokenize(start_index, end_index, attributes = {})
|
80
|
+
PyCall.with(@py_doc.retokenize()) do |retokenizer|
|
81
|
+
retokenizer.merge(@py_doc[start_index .. end_index], attrs: attributes)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
# Retokenizes the text splitting the specified token.
|
86
|
+
# @param pos_in_doc [Integer] the position of the span to be retokenized in the document
|
87
|
+
# @param split_array [Array<String>] text strings of the split results
|
88
|
+
# @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
|
89
|
+
# @param attributes [Hash] the attributes of the split elements
|
90
|
+
def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
|
91
|
+
PyCall.with(@py_doc.retokenize()) do |retokenizer|
|
92
|
+
heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
|
93
|
+
retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
# String representation of the document.
|
98
|
+
# @return [String]
|
99
|
+
def to_s
|
100
|
+
@text
|
101
|
+
end
|
102
|
+
|
103
|
+
# Returns an array of tokens contained in the doc.
|
104
|
+
# @return [Array<Token>]
|
105
|
+
def tokens
|
106
|
+
results = []
|
107
|
+
PyCall::List.(@py_doc).each do |py_token|
|
108
|
+
results << Token.new(py_token)
|
109
|
+
end
|
110
|
+
results
|
111
|
+
end
|
112
|
+
|
113
|
+
# Iterates over the elements in the doc yielding a token instance each time.
|
114
|
+
def each
|
115
|
+
PyCall::List.(@py_doc).each do |py_token|
|
116
|
+
yield Token.new(py_token)
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
# Returns a span of the specified range within the doc.
|
121
|
+
# The method should be used either of the two ways: `Doc#span(range)` or `Doc#span{start_pos, size_of_span}`.
|
122
|
+
# @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
|
123
|
+
# @param optional_size [Integer] an integer representing the size of the span
|
124
|
+
# @return [Span]
|
125
|
+
def span(range_or_start, optional_size = nil)
|
126
|
+
if optional_size
|
127
|
+
start_index = range_or_start
|
128
|
+
temp = tokens[start_index ... start_index + optional_size]
|
129
|
+
else
|
130
|
+
start_index = range_or_start.first
|
131
|
+
range = range_or_start
|
132
|
+
temp = tokens[range]
|
133
|
+
end
|
134
|
+
|
135
|
+
end_index = start_index + temp.size - 1
|
136
|
+
|
137
|
+
Span.new(self, start_index: start_index, end_index: end_index)
|
138
|
+
end
|
139
|
+
|
140
|
+
# Returns an array of spans representing noun chunks.
|
141
|
+
# @return [Array<Span>]
|
142
|
+
def noun_chunks
|
143
|
+
chunk_array = []
|
144
|
+
py_chunks = PyCall::List.(@py_doc.noun_chunks)
|
145
|
+
py_chunks.each do |py_chunk|
|
146
|
+
chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
|
147
|
+
end
|
148
|
+
chunk_array
|
149
|
+
end
|
150
|
+
|
151
|
+
# Returns an array of spans each representing a sentence.
|
152
|
+
# @return [Array<Span>]
|
153
|
+
def sents
|
154
|
+
sentence_array = []
|
155
|
+
py_sentences = PyCall::List.(@py_doc.sents)
|
156
|
+
py_sentences.each do |py_sent|
|
157
|
+
sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
|
158
|
+
end
|
159
|
+
sentence_array
|
160
|
+
end
|
161
|
+
|
162
|
+
# Returns an array of spans each representing a named entity.
|
163
|
+
# @return [Array<Span>]
|
164
|
+
def ents
|
165
|
+
# so that ents canbe "each"-ed in Ruby
|
166
|
+
ent_array = []
|
167
|
+
PyCall::List.(@py_doc.ents).each do |ent|
|
168
|
+
ent.define_singleton_method :label do
|
169
|
+
return self.label_
|
170
|
+
end
|
171
|
+
ent_array << ent
|
172
|
+
end
|
173
|
+
ent_array
|
174
|
+
end
|
175
|
+
|
176
|
+
# Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
|
177
|
+
# @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
|
178
|
+
def [](range)
|
179
|
+
if range.is_a?(Range)
|
180
|
+
py_span = @py_doc[range]
|
181
|
+
return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
|
182
|
+
else
|
183
|
+
return Token.new(@py_doc[range])
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
# Returns a semantic similarity estimate.
|
188
|
+
# @param other [Doc] the other doc to which a similarity estimation is made
|
189
|
+
# @return [Float]
|
190
|
+
def similarity(other)
|
191
|
+
py_doc.similarity(other.py_doc)
|
192
|
+
end
|
193
|
+
|
194
|
+
# Visualize the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
|
195
|
+
# @param style [String] either `dep` or `ent`
|
196
|
+
# @param compact [Boolean] only relevant to the `dep' style
|
197
|
+
# @return [String] in the case of `dep`, the output text will be an SVG, whereas in the `ent` style, the output text will be an HTML.
|
198
|
+
def displacy(style: "dep", compact: false)
|
199
|
+
PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
|
200
|
+
end
|
201
|
+
|
202
|
+
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
203
|
+
def method_missing(name, *args)
|
204
|
+
@py_doc.send(name, *args)
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
208
|
+
# See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
|
209
|
+
class Language
|
210
|
+
|
211
|
+
# @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
|
212
|
+
attr_reader :spacy_nlp_id
|
213
|
+
|
214
|
+
# @return [Object] a Python `Language` instance accessible via `PyCall`
|
215
|
+
attr_reader :py_nlp
|
216
|
+
|
217
|
+
# Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
|
218
|
+
# @param model [String] A language model installed in the system
|
219
|
+
def initialize(model = "en_core_web_sm")
|
220
|
+
@spacy_nlp_id = "nlp_#{model.object_id}"
|
221
|
+
PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
|
222
|
+
@py_nlp = PyCall.eval(@spacy_nlp_id)
|
223
|
+
end
|
224
|
+
|
225
|
+
# Reads and analyze the given text.
|
226
|
+
# @param text [String] a text to be read and analyzed
|
227
|
+
def read(text)
|
228
|
+
Doc.new(py_nlp, text: text)
|
229
|
+
end
|
230
|
+
|
231
|
+
# Generates a matcher for the current language model.
|
232
|
+
# @return [Matcher]
|
233
|
+
def matcher
|
234
|
+
Matcher.new(@py_nlp)
|
235
|
+
end
|
236
|
+
|
237
|
+
# A utility method to lookup a vocabulary item of the given id.
|
238
|
+
# @param id [Integer] a vocabulary id
|
239
|
+
# @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
|
240
|
+
def vocab_string_lookup(id)
|
241
|
+
PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
|
242
|
+
end
|
243
|
+
|
244
|
+
# A utility method to list pipeline components.
|
245
|
+
# @return [Array<String>] An array of text strings representing pipeline components
|
246
|
+
def pipe_names
|
247
|
+
pipe_array = []
|
248
|
+
PyCall::List.(@py_nlp.pipe_names).each do |pipe|
|
249
|
+
pipe_array << pipe
|
250
|
+
end
|
251
|
+
pipe_array
|
252
|
+
end
|
253
|
+
|
254
|
+
# A utility method to get a Python `Lexeme` object.
|
255
|
+
# @param text [String] A text string representing a lexeme
|
256
|
+
# @return [Object] Python `Lexeme` object (https://spacy.io/api/lexeme)
|
257
|
+
def get_lexeme(text)
|
258
|
+
@py_nlp.vocab[text]
|
259
|
+
end
|
260
|
+
|
261
|
+
# Returns a ruby lexeme object
|
262
|
+
# @param text [String] a text string representing the vocabulary item
|
263
|
+
# @return [Lexeme]
|
264
|
+
def vocab(text)
|
265
|
+
Lexeme.new(@py_nlp.vocab[text])
|
266
|
+
end
|
267
|
+
|
268
|
+
# Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
|
269
|
+
# @param vector [Object] A vector representation of a word (whether existing or non-existing)
|
270
|
+
# @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
|
271
|
+
def most_similar(vector, n)
|
272
|
+
vec_array = Numpy.asarray([vector])
|
273
|
+
py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
|
274
|
+
key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist}]")
|
275
|
+
keys = key_texts.map{|kt| kt[0]}
|
276
|
+
texts = key_texts.map{|kt| kt[1]}
|
277
|
+
best_rows = PyCall::List.(py_result[1])[0]
|
278
|
+
scores = PyCall::List.(py_result[2])[0]
|
279
|
+
|
280
|
+
results = []
|
281
|
+
n.times do |i|
|
282
|
+
result = {key: keys[i].to_i,
|
283
|
+
text: texts[i],
|
284
|
+
best_row: best_rows[i],
|
285
|
+
score: scores[i]
|
286
|
+
}
|
287
|
+
result.each_key do |key|
|
288
|
+
result.define_singleton_method(key){ result[key] }
|
289
|
+
end
|
290
|
+
results << result
|
291
|
+
end
|
292
|
+
results
|
293
|
+
end
|
294
|
+
|
295
|
+
# Utility function to batch process many texts
|
296
|
+
# @param texts [String]
|
297
|
+
# @param disable [Array<String>]
|
298
|
+
# @param batch_size [Integer]
|
299
|
+
# @return [Array<Doc>]
|
300
|
+
def pipe(texts, disable: [], batch_size: 50)
|
301
|
+
docs = []
|
302
|
+
PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
|
303
|
+
docs << Doc.new(@py_nlp, py_doc: py_doc)
|
304
|
+
end
|
305
|
+
docs
|
306
|
+
end
|
307
|
+
|
308
|
+
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism....
|
309
|
+
def method_missing(name, *args)
|
310
|
+
@py_nlp.send(name, *args)
|
311
|
+
end
|
312
|
+
end
|
313
|
+
|
314
|
+
# See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
|
315
|
+
class Matcher
|
316
|
+
|
317
|
+
# @return [Object] a Python `Matcher` instance accessible via `PyCall`
|
318
|
+
attr_reader :py_matcher
|
319
|
+
|
320
|
+
# Creates a {Matcher} instance
|
321
|
+
# @param nlp [Language] an instance of {Language} class
|
322
|
+
def initialize(nlp)
|
323
|
+
@py_matcher = PyMatcher.(nlp.vocab)
|
324
|
+
end
|
325
|
+
|
326
|
+
# Adds a label string and a text pattern.
|
327
|
+
# @param text [String] a label string given to the pattern
|
328
|
+
# @param pattern [Array<Array<Hash>>] sequences of text patterns that are alternative to each other
|
329
|
+
def add(text, pattern)
|
330
|
+
@py_matcher.add(text, pattern)
|
331
|
+
end
|
332
|
+
|
333
|
+
# Execute the match.
|
334
|
+
# @param doc [Doc] an {Doc} instance
|
335
|
+
# @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
|
336
|
+
def match(doc)
|
337
|
+
str_results = @py_matcher.(doc.py_doc).to_s
|
338
|
+
s = StringScanner.new(str_results[1..-2])
|
339
|
+
results = []
|
340
|
+
while s.scan_until(/(\d+), (\d+), (\d+)/)
|
341
|
+
next unless s.matched
|
342
|
+
triple = s.matched.split(", ")
|
343
|
+
match_id = triple[0].to_i
|
344
|
+
start_index = triple[1].to_i
|
345
|
+
end_index = triple[2].to_i - 1
|
346
|
+
results << {match_id: match_id, start_index: start_index, end_index: end_index}
|
347
|
+
end
|
348
|
+
results
|
349
|
+
end
|
350
|
+
end
|
351
|
+
|
19
352
|
# See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
|
20
353
|
class Span
|
21
354
|
|
22
|
-
# @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
|
23
|
-
attr_reader :spacy_span_id
|
24
|
-
|
25
355
|
# @return [Object] a Python `Span` instance accessible via `PyCall`
|
26
356
|
attr_reader :py_span
|
27
357
|
|
@@ -35,21 +365,18 @@ module Spacy
|
|
35
365
|
alias_method :size, :count
|
36
366
|
|
37
367
|
# It is recommended to use {Doc#span} method to create a span. If you need to
|
38
|
-
# create one using {Span#initialize},
|
368
|
+
# create one using {Span#initialize}, there are two method signatures:
|
369
|
+
# `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
|
39
370
|
# @param doc [Doc] the document to which this span belongs to
|
40
371
|
# @param start_index [Integer] the index of the item starting the span inside a doc
|
41
372
|
# @param end_index [Integer] the index of the item ending the span inside a doc
|
42
373
|
# @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
|
43
374
|
def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
|
44
375
|
@doc = doc
|
45
|
-
@spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
|
46
376
|
if py_span
|
47
377
|
@py_span = py_span
|
48
378
|
else
|
49
|
-
|
50
|
-
PyCall.exec("#{@spacy_span_id}_opts = #{options}")
|
51
|
-
PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
|
52
|
-
@py_span = PyCall.eval(@spacy_span_id)
|
379
|
+
@py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
|
53
380
|
end
|
54
381
|
end
|
55
382
|
|
@@ -63,7 +390,7 @@ module Spacy
|
|
63
390
|
results
|
64
391
|
end
|
65
392
|
|
66
|
-
# Iterates over the elements in the span yielding a token instance.
|
393
|
+
# Iterates over the elements in the span yielding a token instance each time.
|
67
394
|
def each
|
68
395
|
PyCall::List.(@py_span).each do |py_token|
|
69
396
|
yield Token.new(py_token)
|
@@ -76,18 +403,24 @@ module Spacy
|
|
76
403
|
chunk_array = []
|
77
404
|
py_chunks = PyCall::List.(@py_span.noun_chunks)
|
78
405
|
py_chunks.each do |py_span|
|
79
|
-
chunk_array <<
|
406
|
+
chunk_array << Span.new(@doc, py_span: py_span)
|
80
407
|
end
|
81
408
|
chunk_array
|
82
409
|
end
|
83
410
|
|
411
|
+
# Returns the head token
|
412
|
+
# @return [Token]
|
413
|
+
def root
|
414
|
+
Token.new(@py_span.root)
|
415
|
+
end
|
416
|
+
|
84
417
|
# Returns an array of spans that represents sentences.
|
85
418
|
# @return [Array<Span>]
|
86
419
|
def sents
|
87
420
|
sentence_array = []
|
88
421
|
py_sentences = PyCall::List.(@py_span.sents)
|
89
422
|
py_sentences.each do |py_span|
|
90
|
-
sentence_array <<
|
423
|
+
sentence_array << Span.new(@doc, py_span: py_span)
|
91
424
|
end
|
92
425
|
sentence_array
|
93
426
|
end
|
@@ -97,8 +430,7 @@ module Spacy
|
|
97
430
|
def ents
|
98
431
|
ent_array = []
|
99
432
|
PyCall::List.(@py_span.ents).each do |py_span|
|
100
|
-
|
101
|
-
ent_array << Spacy::Span.new(@doc, py_span: py_span)
|
433
|
+
ent_array << Span.new(@doc, py_span: py_span)
|
102
434
|
end
|
103
435
|
ent_array
|
104
436
|
end
|
@@ -106,18 +438,18 @@ module Spacy
|
|
106
438
|
# Returns a span that represents the sentence that the given span is part of.
|
107
439
|
# @return [Span]
|
108
440
|
def sent
|
109
|
-
py_span
|
110
|
-
return
|
441
|
+
py_span = @py_span.sent
|
442
|
+
return Span.new(@doc, py_span: py_span)
|
111
443
|
end
|
112
444
|
|
113
|
-
# Returns a span if a range object is given
|
445
|
+
# Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
|
114
446
|
# @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
|
115
447
|
def [](range)
|
116
448
|
if range.is_a?(Range)
|
117
449
|
py_span = @py_span[range]
|
118
|
-
return
|
450
|
+
return Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
|
119
451
|
else
|
120
|
-
return
|
452
|
+
return Token.new(@py_span[range])
|
121
453
|
end
|
122
454
|
end
|
123
455
|
|
@@ -125,31 +457,31 @@ module Spacy
|
|
125
457
|
# @param other [Span] the other span to which a similarity estimation is conducted
|
126
458
|
# @return [Float]
|
127
459
|
def similarity(other)
|
128
|
-
|
460
|
+
py_span.similarity(other.py_span)
|
129
461
|
end
|
130
462
|
|
131
|
-
# Creates a document instance
|
463
|
+
# Creates a document instance from the span
|
132
464
|
# @return [Doc]
|
133
465
|
def as_doc
|
134
|
-
|
466
|
+
Doc.new(@doc.py_nlp, text: self.text)
|
135
467
|
end
|
136
468
|
|
137
|
-
# Returns
|
469
|
+
# Returns tokens conjugated to the root of the span.
|
138
470
|
# @return [Array<Token>] an array of tokens
|
139
471
|
def conjuncts
|
140
472
|
conjunct_array = []
|
141
473
|
PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
|
142
|
-
conjunct_array <<
|
474
|
+
conjunct_array << Token.new(py_conjunct)
|
143
475
|
end
|
144
476
|
conjunct_array
|
145
477
|
end
|
146
478
|
|
147
|
-
# Returns
|
479
|
+
# Returns tokens that are to the left of the span, whose heads are within the span.
|
148
480
|
# @return [Array<Token>] an array of tokens
|
149
481
|
def lefts
|
150
482
|
left_array = []
|
151
483
|
PyCall::List.(@py_span.lefts).each do |py_left|
|
152
|
-
left_array <<
|
484
|
+
left_array << Token.new(py_left)
|
153
485
|
end
|
154
486
|
left_array
|
155
487
|
end
|
@@ -159,7 +491,7 @@ module Spacy
|
|
159
491
|
def rights
|
160
492
|
right_array = []
|
161
493
|
PyCall::List.(@py_span.rights).each do |py_right|
|
162
|
-
right_array <<
|
494
|
+
right_array << Token.new(py_right)
|
163
495
|
end
|
164
496
|
right_array
|
165
497
|
end
|
@@ -169,11 +501,17 @@ module Spacy
|
|
169
501
|
def subtree
|
170
502
|
subtree_array = []
|
171
503
|
PyCall::List.(@py_span.subtree).each do |py_subtree|
|
172
|
-
subtree_array <<
|
504
|
+
subtree_array << Token.new(py_subtree)
|
173
505
|
end
|
174
506
|
subtree_array
|
175
507
|
end
|
176
508
|
|
509
|
+
# Returns the label
|
510
|
+
# @return [String]
|
511
|
+
def label
|
512
|
+
@py_span.label_
|
513
|
+
end
|
514
|
+
|
177
515
|
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
178
516
|
def method_missing(name, *args)
|
179
517
|
@py_span.send(name, *args)
|
@@ -189,59 +527,67 @@ module Spacy
|
|
189
527
|
# @return [String] a string representing the token
|
190
528
|
attr_reader :text
|
191
529
|
|
192
|
-
# It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
|
530
|
+
# It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
|
531
|
+
# There is no way to generate a token from scratch but relying on a pre-exising Python {Token} object.
|
193
532
|
# @param py_token [Object] Python `Token` object
|
194
533
|
def initialize(py_token)
|
195
534
|
@py_token = py_token
|
196
535
|
@text = @py_token.text
|
197
536
|
end
|
198
537
|
|
538
|
+
|
539
|
+
# Returns the head token
|
540
|
+
# @return [Token]
|
541
|
+
def head
|
542
|
+
Token.new(@py_token.head)
|
543
|
+
end
|
544
|
+
|
199
545
|
# Returns the token in question and the tokens that descend from it.
|
200
|
-
# @return [Array<
|
546
|
+
# @return [Array<Token>] an array of tokens
|
201
547
|
def subtree
|
202
548
|
descendant_array = []
|
203
549
|
PyCall::List.(@py_token.subtree).each do |descendant|
|
204
|
-
descendant_array << descendant
|
550
|
+
descendant_array << Token.new(descendant)
|
205
551
|
end
|
206
552
|
descendant_array
|
207
553
|
end
|
208
554
|
|
209
555
|
# Returns the token's ancestors.
|
210
|
-
# @return [Array<
|
556
|
+
# @return [Array<Token>] an array of tokens
|
211
557
|
def ancestors
|
212
558
|
ancestor_array = []
|
213
559
|
PyCall::List.(@py_token.ancestors).each do |ancestor|
|
214
|
-
ancestor_array << ancestor
|
560
|
+
ancestor_array << Token.new(ancestor)
|
215
561
|
end
|
216
562
|
ancestor_array
|
217
563
|
end
|
218
564
|
|
219
565
|
# Returns a sequence of the token's immediate syntactic children.
|
220
|
-
# @return [Array<
|
566
|
+
# @return [Array<Token>] an array of tokens
|
221
567
|
def children
|
222
568
|
child_array = []
|
223
569
|
PyCall::List.(@py_token.children).each do |child|
|
224
|
-
child_array << child
|
570
|
+
child_array << Token.new(child)
|
225
571
|
end
|
226
572
|
child_array
|
227
573
|
end
|
228
574
|
|
229
575
|
# The leftward immediate children of the word in the syntactic dependency parse.
|
230
|
-
# @return [Array<
|
576
|
+
# @return [Array<Token>] an array of tokens
|
231
577
|
def lefts
|
232
578
|
token_array = []
|
233
579
|
PyCall::List.(@py_token.lefts).each do |token|
|
234
|
-
token_array << token
|
580
|
+
token_array << Token.new(token)
|
235
581
|
end
|
236
582
|
token_array
|
237
583
|
end
|
238
584
|
|
239
585
|
# The rightward immediate children of the word in the syntactic dependency parse.
|
240
|
-
# @return [Array<
|
586
|
+
# @return [Array<Token>] an array of tokens
|
241
587
|
def rights
|
242
588
|
token_array = []
|
243
589
|
PyCall::List.(@py_token.rights).each do |token|
|
244
|
-
token_array << token
|
590
|
+
token_array << Token.new(token)
|
245
591
|
end
|
246
592
|
token_array
|
247
593
|
end
|
@@ -252,314 +598,161 @@ module Spacy
|
|
252
598
|
@text
|
253
599
|
end
|
254
600
|
|
255
|
-
#
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
# @return [String] a text string of the document
|
274
|
-
attr_reader :text
|
275
|
-
|
276
|
-
include Enumerable
|
277
|
-
|
278
|
-
alias_method :length, :count
|
279
|
-
alias_method :len, :count
|
280
|
-
alias_method :size, :count
|
281
|
-
|
282
|
-
# Creates a new instance of {Doc}.
|
283
|
-
# @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
|
284
|
-
# @param text [String] The text string to be analyzed
|
285
|
-
def initialize(nlp_id, text)
|
286
|
-
@text = text
|
287
|
-
@spacy_nlp_id = nlp_id
|
288
|
-
@spacy_doc_id = "doc_#{text.object_id}"
|
289
|
-
quoted = text.gsub('"', '\"')
|
290
|
-
PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
|
291
|
-
PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
|
292
|
-
@py_doc = PyCall.eval(@spacy_doc_id)
|
293
|
-
end
|
294
|
-
|
295
|
-
|
296
|
-
# Retokenizes the text merging a span into a single token.
|
297
|
-
# @param start_index [Integer] The start position of the span to be retokenized in the document
|
298
|
-
# @param end_index [Integer] The end position of the span to be retokenized in the document
|
299
|
-
# @param attributes [Hash] Attributes to set on the merged token
|
300
|
-
def retokenize(start_index, end_index, attributes = {})
|
301
|
-
py_attrs = PyCall::Dict.(attributes)
|
302
|
-
PyCall.exec(<<PY)
|
303
|
-
with #{@spacy_doc_id}.retokenize() as retokenizer:
|
304
|
-
retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
|
305
|
-
PY
|
306
|
-
@py_doc = PyCall.eval(@spacy_doc_id)
|
307
|
-
end
|
308
|
-
|
309
|
-
# Retokenizes the text splitting the specified token.
|
310
|
-
# @param pos_in_doc [Integer] The position of the span to be retokenized in the document
|
311
|
-
# @param split_array [Array<String>] text strings of the split results
|
312
|
-
# @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
|
313
|
-
# @param attributes [Hash] The attributes of the split elements
|
314
|
-
def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
|
315
|
-
py_attrs = PyCall::Dict.(attributes)
|
316
|
-
py_split_array = PyCall::List.(split_array)
|
317
|
-
PyCall.exec(<<PY)
|
318
|
-
with #{@spacy_doc_id}.retokenize() as retokenizer:
|
319
|
-
heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
|
320
|
-
attrs = #{py_attrs}
|
321
|
-
split_array = #{py_split_array}
|
322
|
-
retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
|
323
|
-
PY
|
324
|
-
@py_doc = PyCall.eval(@spacy_doc_id)
|
601
|
+
# Returns a hash or string of morphological information
|
602
|
+
# @param hash [Boolean] if true, a hash will be returned instead of a string
|
603
|
+
# @return [Hash, String]
|
604
|
+
def morphology(hash = true)
|
605
|
+
if @py_token.has_morph
|
606
|
+
morph_analysis = @py_token.morph
|
607
|
+
if hash
|
608
|
+
return morph_analysis.to_dict
|
609
|
+
else
|
610
|
+
return morph_analysis.to_s
|
611
|
+
end
|
612
|
+
else
|
613
|
+
if hash
|
614
|
+
results = {}
|
615
|
+
else
|
616
|
+
return ""
|
617
|
+
end
|
618
|
+
end
|
325
619
|
end
|
326
620
|
|
327
|
-
#
|
621
|
+
# Returns the lemma by calling `lemma_` of `@py_token` object
|
328
622
|
# @return [String]
|
329
|
-
def
|
330
|
-
@
|
623
|
+
def lemma
|
624
|
+
@py_token.lemma_
|
331
625
|
end
|
332
626
|
|
333
|
-
# Returns
|
334
|
-
# @return [
|
335
|
-
def
|
336
|
-
|
337
|
-
PyCall::List.(@py_doc).each do |py_token|
|
338
|
-
results << Token.new(py_token)
|
339
|
-
end
|
340
|
-
results
|
627
|
+
# Returns the lowercase form by calling `lower_` of `@py_token` object
|
628
|
+
# @return [String]
|
629
|
+
def lower
|
630
|
+
@py_token.lower_
|
341
631
|
end
|
342
632
|
|
343
|
-
#
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
end
|
633
|
+
# Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_token` object
|
634
|
+
# @return [String]
|
635
|
+
def shape
|
636
|
+
@py_token.shape_
|
348
637
|
end
|
349
638
|
|
350
|
-
# Returns
|
351
|
-
#
|
352
|
-
|
353
|
-
|
354
|
-
# @return [Span]
|
355
|
-
def span(range_or_start, optional_size = nil)
|
356
|
-
if optional_size
|
357
|
-
start_index = range_or_start
|
358
|
-
temp = tokens[start_index ... start_index + optional_size]
|
359
|
-
else
|
360
|
-
start_index = range_or_start.first
|
361
|
-
range = range_or_start
|
362
|
-
temp = tokens[range]
|
363
|
-
end
|
364
|
-
|
365
|
-
end_index = start_index + temp.size - 1
|
366
|
-
|
367
|
-
Span.new(self, start_index: start_index, end_index: end_index)
|
639
|
+
# Returns the pos by calling `pos_` of `@py_token` object
|
640
|
+
# @return [String]
|
641
|
+
def pos
|
642
|
+
@py_token.pos_
|
368
643
|
end
|
369
644
|
|
370
|
-
# Returns
|
371
|
-
# @return [
|
372
|
-
def
|
373
|
-
|
374
|
-
py_chunks = PyCall::List.(@py_doc.noun_chunks)
|
375
|
-
py_chunks.each do |py_chunk|
|
376
|
-
chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
|
377
|
-
end
|
378
|
-
chunk_array
|
645
|
+
# Returns the fine-grained pos by calling `tag_` of `@py_token` object
|
646
|
+
# @return [String]
|
647
|
+
def tag
|
648
|
+
@py_token.tag_
|
379
649
|
end
|
380
650
|
|
381
|
-
# Returns
|
382
|
-
# @return [
|
383
|
-
def
|
384
|
-
|
385
|
-
py_sentences = PyCall::List.(@py_doc.sents)
|
386
|
-
py_sentences.each do |py_sent|
|
387
|
-
sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
|
388
|
-
end
|
389
|
-
sentence_array
|
651
|
+
# Returns the dependency relation by calling `dep_` of `@py_token` object
|
652
|
+
# @return [String]
|
653
|
+
def dep
|
654
|
+
@py_token.dep_
|
390
655
|
end
|
391
|
-
|
392
|
-
# Returns
|
393
|
-
# @return [
|
394
|
-
def
|
395
|
-
|
396
|
-
ent_array = []
|
397
|
-
PyCall::List.(@py_doc.ents).each do |ent|
|
398
|
-
ent_array << ent
|
399
|
-
end
|
400
|
-
ent_array
|
656
|
+
|
657
|
+
# Returns the language by calling `lang_` of `@py_token` object
|
658
|
+
# @return [String]
|
659
|
+
def lang
|
660
|
+
@py_token.lang_
|
401
661
|
end
|
402
662
|
|
403
|
-
# Returns
|
404
|
-
# @
|
405
|
-
def
|
406
|
-
|
407
|
-
py_span = @py_doc[range]
|
408
|
-
return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
|
409
|
-
else
|
410
|
-
return Token.new(@py_doc[range])
|
411
|
-
end
|
663
|
+
# Returns the trailing space character if present by calling `whitespace_` of `@py_token` object
|
664
|
+
# @return [String]
|
665
|
+
def whitespace
|
666
|
+
@py_token.whitespace_
|
412
667
|
end
|
413
668
|
|
414
|
-
# Returns
|
415
|
-
# @
|
416
|
-
|
417
|
-
|
418
|
-
PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
|
669
|
+
# Returns the named entity type by calling `ent_type_` of `@py_token` object
|
670
|
+
# @return [String]
|
671
|
+
def ent_type
|
672
|
+
@py_token.ent_type_
|
419
673
|
end
|
420
674
|
|
421
|
-
#
|
422
|
-
# @
|
423
|
-
|
424
|
-
|
425
|
-
def displacy(style: "dep", compact: false)
|
426
|
-
PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
|
675
|
+
# Returns a lexeme object
|
676
|
+
# @return [Lexeme]
|
677
|
+
def lexeme
|
678
|
+
Lexeme.new(@py_token.lex)
|
427
679
|
end
|
428
680
|
|
429
681
|
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
430
682
|
def method_missing(name, *args)
|
431
|
-
@
|
683
|
+
@py_token.send(name, *args)
|
432
684
|
end
|
433
685
|
end
|
434
686
|
|
435
|
-
# See also spaCy Python API document for [`
|
436
|
-
class
|
687
|
+
# See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
|
688
|
+
class Lexeme
|
437
689
|
|
438
|
-
# @return [
|
439
|
-
attr_reader :
|
440
|
-
|
441
|
-
# @return [Object] a Python `Matcher` instance accessible via `PyCall`
|
442
|
-
attr_reader :py_matcher
|
443
|
-
|
444
|
-
# Creates a {Matcher} instance
|
445
|
-
# @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
|
446
|
-
def initialize(nlp_id)
|
447
|
-
@spacy_matcher_id = "doc_#{nlp_id}_matcher"
|
448
|
-
PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
|
449
|
-
@py_matcher = PyCall.eval(@spacy_matcher_id)
|
450
|
-
end
|
690
|
+
# @return [Object] a Python `Lexeme` instance accessible via `PyCall`
|
691
|
+
attr_reader :py_lexeme
|
451
692
|
|
452
|
-
#
|
453
|
-
|
454
|
-
# @param pattern [Array<Array<Hash>>] alternative sequences of text patterns
|
455
|
-
def add(text, pattern)
|
456
|
-
@py_matcher.add(text, pattern)
|
457
|
-
end
|
693
|
+
# @return [String] a string representing the lexeme
|
694
|
+
attr_reader :text
|
458
695
|
|
459
|
-
#
|
460
|
-
#
|
461
|
-
# @
|
462
|
-
def
|
463
|
-
|
464
|
-
|
465
|
-
results = []
|
466
|
-
while s.scan_until(/(\d+), (\d+), (\d+)/)
|
467
|
-
next unless s.matched
|
468
|
-
triple = s.matched.split(", ")
|
469
|
-
match_id = triple[0].to_i
|
470
|
-
start_index = triple[1].to_i
|
471
|
-
end_index = triple[2].to_i - 1
|
472
|
-
results << {match_id: match_id, start_index: start_index, end_index: end_index}
|
473
|
-
end
|
474
|
-
results
|
696
|
+
# It is recommended to use {Language#vocab} or {Token#lexeme} methods to create tokens.
|
697
|
+
# There is no way to generate a lexeme from scratch other than by relying on a pre-existing Python {Lexeme} object.
|
698
|
+
# @param py_lexeme [Object] Python `Lexeme` object
|
699
|
+
def initialize(py_lexeme)
|
700
|
+
@py_lexeme = py_lexeme
|
701
|
+
@text = @py_lexeme.text
|
475
702
|
end
|
476
|
-
end
|
477
|
-
|
478
|
-
# See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
|
479
|
-
class Language
|
480
|
-
|
481
|
-
# @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
|
482
|
-
attr_reader :spacy_nlp_id
|
483
|
-
|
484
|
-
# @return [Object] a Python `Language` instance accessible via `PyCall`
|
485
|
-
attr_reader :py_nlp
|
486
703
|
|
487
|
-
#
|
488
|
-
# @
|
489
|
-
def
|
490
|
-
@
|
491
|
-
PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
|
492
|
-
PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
|
493
|
-
@py_nlp = PyCall.eval(@spacy_nlp_id)
|
704
|
+
# String representation of the lexeme.
|
705
|
+
# @return [String]
|
706
|
+
def to_s
|
707
|
+
@text
|
494
708
|
end
|
495
709
|
|
496
|
-
#
|
497
|
-
# @
|
498
|
-
def
|
499
|
-
|
710
|
+
# Returns the lowercase form by calling `lower_` of `@py_lexeme` object
|
711
|
+
# @return [String]
|
712
|
+
def lower
|
713
|
+
@py_lexeme.lower_
|
500
714
|
end
|
501
715
|
|
502
|
-
#
|
503
|
-
# @return [
|
504
|
-
def
|
505
|
-
|
716
|
+
# Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_lexeme` object
|
717
|
+
# @return [String]
|
718
|
+
def shape
|
719
|
+
@py_lexeme.shape_
|
506
720
|
end
|
507
721
|
|
508
|
-
#
|
509
|
-
# @
|
510
|
-
|
511
|
-
|
512
|
-
PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
|
722
|
+
# Returns the language by calling `lang_` of `@py_lexeme` object
|
723
|
+
# @return [String]
|
724
|
+
def lang
|
725
|
+
@py_lexeme.lang_
|
513
726
|
end
|
514
727
|
|
515
|
-
#
|
516
|
-
# @return [
|
517
|
-
def
|
518
|
-
|
519
|
-
PyCall::List.(@py_nlp.pipe_names).each do |pipe|
|
520
|
-
pipe_array << pipe
|
521
|
-
end
|
522
|
-
pipe_array
|
728
|
+
# Returns the length-N substring from the start of the word by calling `prefix_` of `@py_lexeme` object
|
729
|
+
# @return [String]
|
730
|
+
def prefix
|
731
|
+
@py_lexeme.prefix_
|
523
732
|
end
|
524
|
-
|
525
|
-
#
|
526
|
-
# @return [
|
527
|
-
def
|
528
|
-
|
733
|
+
#
|
734
|
+
# Returns the length-N substring from the end of the word by calling `suffix_` of `@py_lexeme` object
|
735
|
+
# @return [String]
|
736
|
+
def suffix
|
737
|
+
@py_lexeme.suffix_
|
529
738
|
end
|
530
739
|
|
531
|
-
#
|
532
|
-
# @
|
533
|
-
|
534
|
-
|
535
|
-
text = text.gsub("'", "\'")
|
536
|
-
py_lexeme = PyCall.eval("#{@spacy_nlp_id}.vocab['#{text}']")
|
537
|
-
return py_lexeme
|
740
|
+
# Returns the lexeme's norm, i.e. a normalized form of the lexeme, by calling `norm_` of `@py_lexeme` object
|
741
|
+
# @return [String]
|
742
|
+
def norm
|
743
|
+
@py_lexeme.norm_
|
538
744
|
end
|
539
745
|
|
540
|
-
# Returns
|
541
|
-
# @param
|
542
|
-
# @return [
|
543
|
-
def
|
544
|
-
|
545
|
-
py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
|
546
|
-
key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
|
547
|
-
keys = key_texts.map{|kt| kt[0]}
|
548
|
-
texts = key_texts.map{|kt| kt[1]}
|
549
|
-
best_rows = PyCall::List.(py_result[1])[0]
|
550
|
-
scores = PyCall::List.(py_result[2])[0]
|
551
|
-
|
552
|
-
results = []
|
553
|
-
n.times do |i|
|
554
|
-
results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
|
555
|
-
end
|
556
|
-
|
557
|
-
results
|
746
|
+
# Returns a semantic similarity estimate.
|
747
|
+
# @param other [Lexeme] the other lexeme to which a similarity estimation is made
|
748
|
+
# @return [Float]
|
749
|
+
def similarity(other)
|
750
|
+
@py_lexeme.similarity(other.py_lexeme)
|
558
751
|
end
|
559
752
|
|
560
|
-
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism
|
753
|
+
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
561
754
|
def method_missing(name, *args)
|
562
|
-
@
|
755
|
+
@py_lexeme.send(name, *args)
|
563
756
|
end
|
564
757
|
end
|
565
758
|
|