ruby-spacy 0.1.0 → 0.1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -0
- data/Gemfile.lock +3 -1
- data/README.md +123 -77
- data/examples/get_started/lexeme.rb +2 -2
- data/examples/get_started/linguistic_annotations.rb +1 -1
- data/examples/get_started/morphology.rb +45 -0
- data/examples/get_started/most_similar.rb +28 -27
- data/examples/get_started/named_entities.rb +1 -1
- data/examples/get_started/pos_tags_and_dependencies.rb +18 -18
- data/examples/get_started/similarity.rb +2 -2
- data/examples/japanese/ancestors.rb +9 -11
- data/examples/japanese/entity_annotations_and_labels.rb +1 -1
- data/examples/japanese/lemmatization.rb +1 -1
- data/examples/japanese/most_similar.rb +28 -27
- data/examples/japanese/named_entity_recognition.rb +1 -1
- data/examples/japanese/navigating_parse_tree.rb +18 -18
- data/examples/japanese/noun_chunks.rb +1 -1
- data/examples/japanese/pos_tagging.rb +20 -20
- data/examples/japanese/visualizing_dependencies.rb +2 -2
- data/examples/japanese/visualizing_named_entities.rb +1 -1
- data/examples/linguistic_features/ancestors.rb +13 -10
- data/examples/linguistic_features/entity_annotations_and_labels.rb +1 -1
- data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +2 -2
- data/examples/linguistic_features/information_extraction.rb +2 -2
- data/examples/linguistic_features/iterating_children.rb +2 -2
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +5 -5
- data/examples/linguistic_features/lemmatization.rb +1 -1
- data/examples/linguistic_features/named_entity_recognition.rb +1 -1
- data/examples/linguistic_features/navigating_parse_tree.rb +12 -12
- data/examples/linguistic_features/noun_chunks.rb +1 -1
- data/examples/linguistic_features/pos_tagging.rb +1 -1
- data/examples/linguistic_features/retokenize_1.rb +1 -1
- data/examples/linguistic_features/retokenize_2.rb +2 -2
- data/examples/linguistic_features/rule_based_morphology.rb +1 -1
- data/examples/linguistic_features/similarity.rb +2 -2
- data/examples/linguistic_features/similarity_between_lexemes.rb +18 -0
- data/examples/linguistic_features/similarity_between_spans.rb +2 -2
- data/examples/rule_based_matching/creating_spans_from_matches.rb +1 -1
- data/lib/ruby-spacy.rb +493 -300
- data/lib/ruby-spacy/version.rb +1 -1
- data/ruby-spacy.gemspec +1 -1
- metadata +6 -5
- data/examples/linguistic_features/morphology.rb +0 -17
- data/examples/linguistic_features/special_case_tokenization_rules.rb +0 -19
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -12,21 +12,21 @@ headings = ["text", "dep", "head text", "head pos", "children"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
 puts table
 
 # Lemmatizer mode: rule
-#
-# | text | dep | head text | head pos | children
-#
-# | Autonomous | amod | cars | NOUN |
-# | cars | nsubj | shift | VERB |
-# | shift | ROOT | shift | VERB |
-# | insurance | compound | liability | NOUN |
-# | liability | dobj | shift | VERB |
-# | toward | prep | shift | VERB |
-# | manufacturers | pobj | toward | ADP |
-#
+# +---------------+----------+-----------+----------+-------------------------+
+# | text          | dep      | head text | head pos | children                |
+# +---------------+----------+-----------+----------+-------------------------+
+# | Autonomous    | amod     | cars      | NOUN     |                         |
+# | cars          | nsubj    | shift     | VERB     | Autonomous              |
+# | shift         | ROOT     | shift     | VERB     | cars, liability, toward |
+# | insurance     | compound | liability | NOUN     |                         |
+# | liability     | dobj     | shift     | VERB     | insurance               |
+# | toward        | prep     | shift     | VERB     | manufacturers           |
+# | manufacturers | pobj     | toward    | ADP      |                         |
+# +---------------+----------+-----------+----------+-------------------------+
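The rewritten example leans on the Ruby-level getters (`Token#dep`, `Token#pos`, `Token#head`) that 0.1.4 adds in `data/lib/ruby-spacy.rb` further down. A minimal sketch of the same pattern (assumes the `en_core_web_sm` model is installed on the Python side):

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")

doc.each do |token|
  # Each getter returns a plain Ruby string wrapping spaCy's underscore
  # attribute (dep_, pos_, etc.), so no PyCall plumbing is needed here.
  puts [token.text, token.dep, token.head.text, token.head.pos].join(" | ")
end
```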
@@ -12,7 +12,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
 rows = []
 
 doc.noun_chunks.each do |chunk|
-  rows << [chunk.text, chunk.root.text, chunk.root.
+  rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep, token.shape, token.is_alpha, token.is_stop]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -12,7 +12,7 @@ rows = []
 doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.pos, token.dep, token.head.text]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -6,11 +6,11 @@ nlp = Spacy::Language.new("en_core_web_sm")
 sentence = "I live in New York"
 doc = nlp.read(sentence)
 
-puts "Before: " + doc.tokens.
+puts "Before: " + doc.tokens.map(&:text).join(", ")
 
 doc.retokenize(3, 4)
 
-puts "After: " + doc.tokens.
+puts "After: " + doc.tokens.map(&:text).join(", ")
 
 # Before: I, live, in, New, York
 # After: I, live, in, New York
@@ -6,7 +6,7 @@ nlp = Spacy::Language.new("en_core_web_sm")
 doc = nlp.read("Where are you?")
 
 puts "Morph features of the third word: " + doc[2].morph.to_s
-puts "POS of the third word: " + doc[2].
+puts "POS of the third word: " + doc[2].pos
 
 # Morph features of the third word: Case=Nom|Person=2|PronType=Prs
 # POS of the third word: PRON
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
 doc1 = nlp.read("I like salty fries and hamburgers.")
 doc2 = nlp.read("Fast food tastes very good.")
 
-puts "Doc 1: " + doc1
-puts "Doc 2: " + doc2
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
 puts "Similarity: #{doc1.similarity(doc2)}"
 
 # Doc 1: I like salty fries and hamburgers.
@@ -0,0 +1,18 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_lg")
+
+orange = nlp.vocab("orange")
+lemon = nlp.vocab("lemon")
+
+book = nlp.vocab("book")
+magazine = nlp.vocab("magazine")
+
+puts "orange <=> lemon: #{orange.similarity(lemon)}"
+puts "book <=> magazine: #{book.similarity(magazine)}"
+puts "orange <=> book: #{orange.similarity(book)}"
+
+# orange <=> lemon: 0.7080526351928711
+# book <=> magazine: 0.4355940818786621
+# orange <=> book: 0.12197211384773254
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
 doc1 = nlp.read("I like salty fries and hamburgers.")
 doc2 = nlp.read("Fast food tastes very good.")
 
-puts "Doc 1: " + doc1
-puts "Doc 2: " + doc2
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
 puts "Similarity: #{doc1.similarity(doc2)}"
 
 span1 = doc1.span(2, 2) # salty fries
@@ -10,7 +10,7 @@ matches = matcher.match(doc)
 
 matches.each do |match|
   span = Spacy::Span.new(doc, start_index: match[:start_index], end_index: match[:end_index], options: {label: match[:match_id]})
-  puts span.text + " / " + span.
+  puts span.text + " / " + span.label
 end
 
 # Barack Obama / US_PRESIDENT
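For context, the `matches` array consumed above comes from the new `Matcher#match` implemented in `data/lib/ruby-spacy.rb` below, which returns an array of `{match_id:, start_index:, end_index:}` hashes. A hypothetical setup producing such matches (the pattern is illustrative, not taken from the example file):

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
matcher = nlp.matcher

# Hypothetical token pattern; Matcher#add forwards it to spaCy's Python Matcher.
matcher.add("US_PRESIDENT", [[{"LOWER" => "barack"}, {"LOWER" => "obama"}]])

doc = nlp.read("Barack Obama was the 44th president of the United States.")
matches = matcher.match(doc)
# => e.g. [{match_id: ..., start_index: 0, end_index: 1}]
```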
data/lib/ruby-spacy.rb
CHANGED
@@ -3,12 +3,34 @@
 require_relative "ruby-spacy/version"
 require 'enumerator'
 require 'strscan'
-require 'pycall/import'
 require 'numpy'
+require 'pycall/import'
 include PyCall::Import
 
 # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
 module Spacy
+
+  extend PyCall::Import
+  spacy = PyCall.import_module('spacy')
+
+  # Python `Language` class
+  PyLanguage = spacy.language.Language
+
+  # Python `Doc` class object
+  PyDoc = spacy.tokens.Doc
+
+  # Python `Span` class object
+  PySpan = spacy.tokens.Span
+
+  # Python `Token` class object
+  PyToken = spacy.tokens.Token
+
+  # Python `Matcher` class object
+  PyMatcher = spacy.matcher.Matcher
+
+  # Python `displacy` object
+  PyDisplacy = spacy.displacy
+
   # A utility module method to convert Python's generator object to a Ruby array,
   # mainly used on the items inside the array returned from dependency-related methods
   # such as {Span#rights}, {Span#lefts} and {Span#subtree}.
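This hunk is the core of the 0.1.4 refactoring: instead of building Python source strings for `PyCall.exec`/`PyCall.eval`, the wrapper now imports the spaCy modules once and binds the Python classes to Ruby constants (`PyDoc`, `PySpan`, `PyMatcher`, and so on). A minimal sketch of the underlying PyCall pattern, independent of ruby-spacy (assumes spaCy is importable from the active Python):

```ruby
require "pycall/import"
include PyCall::Import

# Import the Python module once; its attributes are plain PyCall objects.
spacy = PyCall.import_module("spacy")

# Imported Python callables can be invoked directly from Ruby with .()
nlp = spacy.load("en_core_web_sm")
py_doc = nlp.("spaCy is written in Python.")
puts PyCall::List.(py_doc).map(&:text).join(" | ")
```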
@@ -16,12 +38,320 @@ module Spacy
     PyCall::List.(py_generator)
   end
 
+  # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
+  class Doc
+
+    # @return [Object] a Python `Language` instance accessible via `PyCall`
+    attr_reader :py_nlp
+
+    # @return [Object] a Python `Doc` instance accessible via `PyCall`
+    attr_reader :py_doc
+
+    # @return [String] a text string of the document
+    attr_reader :text
+
+    include Enumerable
+
+    alias_method :length, :count
+    alias_method :len, :count
+    alias_method :size, :count
+
+    # It is recommended to use {Language#read} method to create a doc. If you need to
+    # create one using {Doc#initialize}, there are two method signatures:
+    # `Spacy::Doc.new(nlp, py_doc: Object)` and `Spacy::Doc.new(nlp, text: String)`.
+    # @param nlp [Language] an instance of {Language} class
+    # @param py_doc [Object] an instance of Python `Doc` class
+    # @param text [String] the text string to be analyzed
+    def initialize(nlp, py_doc: nil, text: nil)
+      @py_nlp = nlp
+      if py_doc
+        @py_doc = py_doc
+      else
+        @py_doc = nlp.(text)
+      end
+      @text = @py_doc.text
+    end
+
+    # Retokenizes the text merging a span into a single token.
+    # @param start_index [Integer] the start position of the span to be retokenized in the document
+    # @param end_index [Integer] the end position of the span to be retokenized in the document
+    # @param attributes [Hash] attributes to set on the merged token
+    def retokenize(start_index, end_index, attributes = {})
+      PyCall.with(@py_doc.retokenize()) do |retokenizer|
+        retokenizer.merge(@py_doc[start_index .. end_index], attrs: attributes)
+      end
+    end
+
+    # Retokenizes the text splitting the specified token.
+    # @param pos_in_doc [Integer] the position of the span to be retokenized in the document
+    # @param split_array [Array<String>] text strings of the split results
+    # @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
+    # @param attributes [Hash] the attributes of the split elements
+    def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
+      PyCall.with(@py_doc.retokenize()) do |retokenizer|
+        heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
+        retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
+      end
+    end
+
+    # String representation of the document.
+    # @return [String]
+    def to_s
+      @text
+    end
+
+    # Returns an array of tokens contained in the doc.
+    # @return [Array<Token>]
+    def tokens
+      results = []
+      PyCall::List.(@py_doc).each do |py_token|
+        results << Token.new(py_token)
+      end
+      results
+    end
+
+    # Iterates over the elements in the doc yielding a token instance each time.
+    def each
+      PyCall::List.(@py_doc).each do |py_token|
+        yield Token.new(py_token)
+      end
+    end
+
+    # Returns a span of the specified range within the doc.
+    # The method should be used in either of the two ways: `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
+    # @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
+    # @param optional_size [Integer] an integer representing the size of the span
+    # @return [Span]
+    def span(range_or_start, optional_size = nil)
+      if optional_size
+        start_index = range_or_start
+        temp = tokens[start_index ... start_index + optional_size]
+      else
+        start_index = range_or_start.first
+        range = range_or_start
+        temp = tokens[range]
+      end
+
+      end_index = start_index + temp.size - 1
+
+      Span.new(self, start_index: start_index, end_index: end_index)
+    end
+
+    # Returns an array of spans representing noun chunks.
+    # @return [Array<Span>]
+    def noun_chunks
+      chunk_array = []
+      py_chunks = PyCall::List.(@py_doc.noun_chunks)
+      py_chunks.each do |py_chunk|
+        chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
+      end
+      chunk_array
+    end
+
+    # Returns an array of spans each representing a sentence.
+    # @return [Array<Span>]
+    def sents
+      sentence_array = []
+      py_sentences = PyCall::List.(@py_doc.sents)
+      py_sentences.each do |py_sent|
+        sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
+      end
+      sentence_array
+    end
+
+    # Returns an array of spans each representing a named entity.
+    # @return [Array<Span>]
+    def ents
+      # so that ents can be "each"-ed in Ruby
+      ent_array = []
+      PyCall::List.(@py_doc.ents).each do |ent|
+        ent.define_singleton_method :label do
+          return self.label_
+        end
+        ent_array << ent
+      end
+      ent_array
+    end
+
+    # Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
+    # @param range [Range] an ordinary Ruby range object such as `0..3`, `1...4`, or `3 .. -1`
+    def [](range)
+      if range.is_a?(Range)
+        py_span = @py_doc[range]
+        return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
+      else
+        return Token.new(@py_doc[range])
+      end
+    end
+
+    # Returns a semantic similarity estimate.
+    # @param other [Doc] the other doc to which a similarity estimation is made
+    # @return [Float]
+    def similarity(other)
+      py_doc.similarity(other.py_doc)
+    end
+
+    # Visualize the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
+    # @param style [String] either `dep` or `ent`
+    # @param compact [Boolean] only relevant to the `dep` style
+    # @return [String] in the case of `dep`, the output text will be an SVG, whereas in the `ent` style, the output text will be an HTML.
+    def displacy(style: "dep", compact: false)
+      PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
+    end
+
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_doc.send(name, *args)
+    end
+  end
+
+  # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
+  class Language
+
+    # @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
+    attr_reader :spacy_nlp_id
+
+    # @return [Object] a Python `Language` instance accessible via `PyCall`
+    attr_reader :py_nlp
+
+    # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
+    # @param model [String] a language model installed in the system
+    def initialize(model = "en_core_web_sm")
+      @spacy_nlp_id = "nlp_#{model.object_id}"
+      PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
+      @py_nlp = PyCall.eval(@spacy_nlp_id)
+    end
+
+    # Reads and analyzes the given text.
+    # @param text [String] a text to be read and analyzed
+    def read(text)
+      Doc.new(py_nlp, text: text)
+    end
+
+    # Generates a matcher for the current language model.
+    # @return [Matcher]
+    def matcher
+      Matcher.new(@py_nlp)
+    end
+
+    # A utility method to look up a vocabulary item of the given id.
+    # @param id [Integer] a vocabulary id
+    # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
+    def vocab_string_lookup(id)
+      PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+    end
+
+    # A utility method to list pipeline components.
+    # @return [Array<String>] an array of text strings representing pipeline components
+    def pipe_names
+      pipe_array = []
+      PyCall::List.(@py_nlp.pipe_names).each do |pipe|
+        pipe_array << pipe
+      end
+      pipe_array
+    end
+
+    # A utility method to get a Python `Lexeme` object.
+    # @param text [String] a text string representing a lexeme
+    # @return [Object] Python `Lexeme` object (https://spacy.io/api/lexeme)
+    def get_lexeme(text)
+      @py_nlp.vocab[text]
+    end
+
+    # Returns a Ruby lexeme object.
+    # @param text [String] a text string representing the vocabulary item
+    # @return [Lexeme]
+    def vocab(text)
+      Lexeme.new(@py_nlp.vocab[text])
+    end
+
+    # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
+    # @param vector [Object] a vector representation of a word (whether existing or non-existing)
+    # @return [Array<Hash{:key => Integer, :text => String, :best_row => Array<Float>, :score => Float}>] an array of hash objects, each of which contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
+    def most_similar(vector, n)
+      vec_array = Numpy.asarray([vector])
+      py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
+      key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist}]")
+      keys = key_texts.map{|kt| kt[0]}
+      texts = key_texts.map{|kt| kt[1]}
+      best_rows = PyCall::List.(py_result[1])[0]
+      scores = PyCall::List.(py_result[2])[0]
+
+      results = []
+      n.times do |i|
+        result = {key: keys[i].to_i,
+                  text: texts[i],
+                  best_row: best_rows[i],
+                  score: scores[i]
+        }
+        result.each_key do |key|
+          result.define_singleton_method(key){ result[key] }
+        end
+        results << result
+      end
+      results
+    end
+
+    # A utility method to batch-process many texts.
+    # @param texts [String]
+    # @param disable [Array<String>]
+    # @param batch_size [Integer]
+    # @return [Array<Doc>]
+    def pipe(texts, disable: [], batch_size: 50)
+      docs = []
+      PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
+        docs << Doc.new(@py_nlp, py_doc: py_doc)
+      end
+      docs
+    end
+
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_nlp.send(name, *args)
+    end
+  end
+
+  # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
+  class Matcher
+
+    # @return [Object] a Python `Matcher` instance accessible via `PyCall`
+    attr_reader :py_matcher
+
+    # Creates a {Matcher} instance.
+    # @param nlp [Language] an instance of {Language} class
+    def initialize(nlp)
+      @py_matcher = PyMatcher.(nlp.vocab)
+    end
+
+    # Adds a label string and a text pattern.
+    # @param text [String] a label string given to the pattern
+    # @param pattern [Array<Array<Hash>>] sequences of text patterns that are alternative to each other
+    def add(text, pattern)
+      @py_matcher.add(text, pattern)
+    end
+
+    # Executes the match.
+    # @param doc [Doc] a {Doc} instance
+    # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
+    def match(doc)
+      str_results = @py_matcher.(doc.py_doc).to_s
+      s = StringScanner.new(str_results[1..-2])
+      results = []
+      while s.scan_until(/(\d+), (\d+), (\d+)/)
+        next unless s.matched
+        triple = s.matched.split(", ")
+        match_id = triple[0].to_i
+        start_index = triple[1].to_i
+        end_index = triple[2].to_i - 1
+        results << {match_id: match_id, start_index: start_index, end_index: end_index}
+      end
+      results
+    end
+  end
+
   # See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
   class Span
 
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_span_id
-
     # @return [Object] a Python `Span` instance accessible via `PyCall`
     attr_reader :py_span
 
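Two of the additions above are worth pulling out: `Doc#span` accepts either a range or a start position plus a size, and `Language#pipe` batch-converts texts into `Doc` objects. A usage sketch (model availability assumed; outputs in comments are illustrative):

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("I live in New York")

# Both call styles documented on Doc#span:
puts doc.span(3, 2).text  # start position and size => "New York"
puts doc.span(3..4).text  # range                   => "New York"

# Batch processing via Language#pipe:
nlp.pipe(["First text.", "Second text."], batch_size: 2).each do |d|
  puts d.tokens.map(&:text).join(", ")
end
```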
@@ -35,21 +365,18 @@ module Spacy
     alias_method :size, :count
 
     # It is recommended to use {Doc#span} method to create a span. If you need to
-    # create one using {Span#initialize},
+    # create one using {Span#initialize}, there are two method signatures:
+    # `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
     # @param doc [Doc] the document to which this span belongs
     # @param start_index [Integer] the index of the item starting the span inside a doc
     # @param end_index [Integer] the index of the item ending the span inside a doc
     # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
     def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
       @doc = doc
-      @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
       if py_span
         @py_span = py_span
       else
-
-        PyCall.exec("#{@spacy_span_id}_opts = #{options}")
-        PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
-        @py_span = PyCall.eval(@spacy_span_id)
+        @py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
       end
     end
 
@@ -63,7 +390,7 @@ module Spacy
       results
     end
 
-    # Iterates over the elements in the span yielding a token instance.
+    # Iterates over the elements in the span yielding a token instance each time.
     def each
       PyCall::List.(@py_span).each do |py_token|
         yield Token.new(py_token)
@@ -76,18 +403,24 @@ module Spacy
       chunk_array = []
       py_chunks = PyCall::List.(@py_span.noun_chunks)
       py_chunks.each do |py_span|
-        chunk_array <<
+        chunk_array << Span.new(@doc, py_span: py_span)
       end
       chunk_array
     end
 
+    # Returns the head token.
+    # @return [Token]
+    def root
+      Token.new(@py_span.root)
+    end
+
     # Returns an array of spans that represents sentences.
     # @return [Array<Span>]
     def sents
       sentence_array = []
       py_sentences = PyCall::List.(@py_span.sents)
       py_sentences.each do |py_span|
-        sentence_array <<
+        sentence_array << Span.new(@doc, py_span: py_span)
       end
       sentence_array
     end
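The added `Span#root` is what lets the noun-chunk example earlier in this diff call `chunk.root.dep` and `chunk.root.head.text`: each chunk is a `Span`, and its root now comes back as a wrapped `Token`. A compact sketch (same model assumption as above):

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Autonomous cars shift insurance liability")

doc.noun_chunks.each do |chunk|
  # chunk.root returns a Token, so the string getters defined on Token apply.
  puts "#{chunk.text} | root: #{chunk.root.text} (#{chunk.root.dep})"
end
```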
@@ -97,8 +430,7 @@ module Spacy
     def ents
       ent_array = []
       PyCall::List.(@py_span.ents).each do |py_span|
-
-        ent_array << Spacy::Span.new(@doc, py_span: py_span)
+        ent_array << Span.new(@doc, py_span: py_span)
       end
       ent_array
     end
@@ -106,18 +438,18 @@ module Spacy
     # Returns a span that represents the sentence that the given span is part of.
     # @return [Span]
     def sent
-      py_span
-      return
+      py_span = @py_span.sent
+      return Span.new(@doc, py_span: py_span)
     end
 
-    # Returns a span if a range object is given
+    # Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
     # @param range [Range] an ordinary Ruby range object such as `0..3`, `1...4`, or `3 .. -1`
     def [](range)
       if range.is_a?(Range)
         py_span = @py_span[range]
-        return
+        return Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
       else
-        return
+        return Token.new(@py_span[range])
       end
     end
 
@@ -125,31 +457,31 @@ module Spacy
     # @param other [Span] the other span to which a similarity estimation is conducted
     # @return [Float]
     def similarity(other)
-
+      py_span.similarity(other.py_span)
     end
 
-    # Creates a document instance
+    # Creates a document instance from the span.
     # @return [Doc]
     def as_doc
-
+      Doc.new(@doc.py_nlp, text: self.text)
     end
 
-    # Returns
+    # Returns tokens conjugated to the root of the span.
     # @return [Array<Token>] an array of tokens
     def conjuncts
       conjunct_array = []
       PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
-        conjunct_array <<
+        conjunct_array << Token.new(py_conjunct)
       end
       conjunct_array
     end
 
-    # Returns
+    # Returns tokens that are to the left of the span, whose heads are within the span.
     # @return [Array<Token>] an array of tokens
     def lefts
       left_array = []
       PyCall::List.(@py_span.lefts).each do |py_left|
-        left_array <<
+        left_array << Token.new(py_left)
       end
       left_array
     end
@@ -159,7 +491,7 @@ module Spacy
     def rights
       right_array = []
       PyCall::List.(@py_span.rights).each do |py_right|
-        right_array <<
+        right_array << Token.new(py_right)
       end
       right_array
     end
@@ -169,11 +501,17 @@ module Spacy
     def subtree
       subtree_array = []
       PyCall::List.(@py_span.subtree).each do |py_subtree|
-        subtree_array <<
+        subtree_array << Token.new(py_subtree)
       end
       subtree_array
     end
 
+    # Returns the label.
+    # @return [String]
+    def label
+      @py_span.label_
+    end
+
     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
       @py_span.send(name, *args)
@@ -189,59 +527,67 @@ module Spacy
     # @return [String] a string representing the token
     attr_reader :text
 
-    # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
+    # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
+    # There is no way to generate a token from scratch but relying on a pre-existing Python {Token} object.
     # @param py_token [Object] Python `Token` object
     def initialize(py_token)
       @py_token = py_token
       @text = @py_token.text
     end
 
+
+    # Returns the head token.
+    # @return [Token]
+    def head
+      Token.new(@py_token.head)
+    end
+
     # Returns the token in question and the tokens that descend from it.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def subtree
       descendant_array = []
       PyCall::List.(@py_token.subtree).each do |descendant|
-        descendant_array << descendant
+        descendant_array << Token.new(descendant)
       end
       descendant_array
     end
 
     # Returns the token's ancestors.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def ancestors
       ancestor_array = []
       PyCall::List.(@py_token.ancestors).each do |ancestor|
-        ancestor_array << ancestor
+        ancestor_array << Token.new(ancestor)
       end
       ancestor_array
     end
 
     # Returns a sequence of the token's immediate syntactic children.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def children
       child_array = []
       PyCall::List.(@py_token.children).each do |child|
-        child_array << child
+        child_array << Token.new(child)
       end
       child_array
     end
 
     # The leftward immediate children of the word in the syntactic dependency parse.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def lefts
       token_array = []
       PyCall::List.(@py_token.lefts).each do |token|
-        token_array << token
+        token_array << Token.new(token)
      end
       token_array
     end
 
     # The rightward immediate children of the word in the syntactic dependency parse.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def rights
       token_array = []
       PyCall::List.(@py_token.rights).each do |token|
-        token_array << token
+        token_array << Token.new(token)
       end
       token_array
     end
@@ -252,314 +598,161 @@ module Spacy
       @text
     end
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # @return [String] a text string of the document
-    attr_reader :text
-
-    include Enumerable
-
-    alias_method :length, :count
-    alias_method :len, :count
-    alias_method :size, :count
-
-    # Creates a new instance of {Doc}.
-    # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
-    # @param text [String] The text string to be analyzed
-    def initialize(nlp_id, text)
-      @text = text
-      @spacy_nlp_id = nlp_id
-      @spacy_doc_id = "doc_#{text.object_id}"
-      quoted = text.gsub('"', '\"')
-      PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
-      PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
-      @py_doc = PyCall.eval(@spacy_doc_id)
-    end
-
-
-    # Retokenizes the text merging a span into a single token.
-    # @param start_index [Integer] The start position of the span to be retokenized in the document
-    # @param end_index [Integer] The end position of the span to be retokenized in the document
-    # @param attributes [Hash] Attributes to set on the merged token
-    def retokenize(start_index, end_index, attributes = {})
-      py_attrs = PyCall::Dict.(attributes)
-      PyCall.exec(<<PY)
-with #{@spacy_doc_id}.retokenize() as retokenizer:
-    retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
-PY
-      @py_doc = PyCall.eval(@spacy_doc_id)
-    end
-
-    # Retokenizes the text splitting the specified token.
-    # @param pos_in_doc [Integer] The position of the span to be retokenized in the document
-    # @param split_array [Array<String>] text strings of the split results
-    # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
-    # @param attributes [Hash] The attributes of the split elements
-    def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
-      py_attrs = PyCall::Dict.(attributes)
-      py_split_array = PyCall::List.(split_array)
-      PyCall.exec(<<PY)
-with #{@spacy_doc_id}.retokenize() as retokenizer:
-    heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
-    attrs = #{py_attrs}
-    split_array = #{py_split_array}
-    retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
-PY
-      @py_doc = PyCall.eval(@spacy_doc_id)
+    # Returns a hash or string of morphological information
+    # @param hash [Boolean] if true, a hash will be returned instead of a string
+    # @return [Hash, String]
+    def morphology(hash = true)
+      if @py_token.has_morph
+        morph_analysis = @py_token.morph
+        if hash
+          return morph_analysis.to_dict
+        else
+          return morph_analysis.to_s
+        end
+      else
+        if hash
+          results = {}
+        else
+          return ""
+        end
+      end
     end
 
-    #
+    # Returns the lemma by calling `lemma_` of `@py_token` object
     # @return [String]
-    def
-      @
+    def lemma
+      @py_token.lemma_
     end
 
-    # Returns
-    # @return [
-    def
-
-      PyCall::List.(@py_doc).each do |py_token|
-        results << Token.new(py_token)
-      end
-      results
+    # Returns the lowercase form by calling `lower_` of `@py_token` object
+    # @return [String]
+    def lower
+      @py_token.lower_
     end
 
-    #
-
-
-
-    end
+    # Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_token` object
+    # @return [String]
+    def shape
+      @py_token.shape_
     end
 
-    # Returns
-    #
-
-
-    # @return [Span]
-    def span(range_or_start, optional_size = nil)
-      if optional_size
-        start_index = range_or_start
-        temp = tokens[start_index ... start_index + optional_size]
-      else
-        start_index = range_or_start.first
-        range = range_or_start
-        temp = tokens[range]
-      end
-
-      end_index = start_index + temp.size - 1
-
-      Span.new(self, start_index: start_index, end_index: end_index)
+    # Returns the pos by calling `pos_` of `@py_token` object
+    # @return [String]
+    def pos
+      @py_token.pos_
     end
 
-    # Returns
-    # @return [
-    def
-
-      py_chunks = PyCall::List.(@py_doc.noun_chunks)
-      py_chunks.each do |py_chunk|
-        chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
-      end
-      chunk_array
+    # Returns the fine-grained pos by calling `tag_` of `@py_token` object
+    # @return [String]
+    def tag
+      @py_token.tag_
     end
 
-    # Returns
-    # @return [
-    def
-
-      py_sentences = PyCall::List.(@py_doc.sents)
-      py_sentences.each do |py_sent|
-        sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
-      end
-      sentence_array
+    # Returns the dependency relation by calling `dep_` of `@py_token` object
+    # @return [String]
+    def dep
+      @py_token.dep_
     end
-
-    # Returns
-    # @return [
-    def
-
-      ent_array = []
-      PyCall::List.(@py_doc.ents).each do |ent|
-        ent_array << ent
-      end
-      ent_array
+
+    # Returns the language by calling `lang_` of `@py_token` object
+    # @return [String]
+    def lang
+      @py_token.lang_
     end
 
-    # Returns
-    # @
-    def
-
-        py_span = @py_doc[range]
-        return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
-      else
-        return Token.new(@py_doc[range])
-      end
+    # Returns the trailing space character if present by calling `whitespace_` of `@py_token` object
+    # @return [String]
+    def whitespace
+      @py_token.whitespace_
     end
 
-    # Returns
-    # @
-
-
-      PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
+    # Returns the named entity type by calling `ent_type_` of `@py_token` object
+    # @return [String]
+    def ent_type
+      @py_token.ent_type_
     end
 
-    #
-    # @
-
-
-    def displacy(style: "dep", compact: false)
-      PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
+    # Returns a lexeme object
+    # @return [Lexeme]
+    def lexeme
+      Lexeme.new(@py_token.lex)
     end
 
     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
-      @
+      @py_token.send(name, *args)
     end
   end
 
-  # See also spaCy Python API document for [`
-  class
+  # See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
+  class Lexeme
 
-    # @return [
-    attr_reader :
-
-    # @return [Object] a Python `Matcher` instance accessible via `PyCall`
-    attr_reader :py_matcher
-
-    # Creates a {Matcher} instance
-    # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
-    def initialize(nlp_id)
-      @spacy_matcher_id = "doc_#{nlp_id}_matcher"
-      PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
-      @py_matcher = PyCall.eval(@spacy_matcher_id)
-    end
+    # @return [Object] a Python `Lexeme` instance accessible via `PyCall`
+    attr_reader :py_lexeme
 
-    #
-
-    # @param pattern [Array<Array<Hash>>] alternative sequences of text patterns
-    def add(text, pattern)
-      @py_matcher.add(text, pattern)
-    end
+    # @return [String] a string representing the token
+    attr_reader :text
 
-    #
-    #
-    # @
-    def
-
-
-      results = []
-      while s.scan_until(/(\d+), (\d+), (\d+)/)
-        next unless s.matched
-        triple = s.matched.split(", ")
-        match_id = triple[0].to_i
-        start_index = triple[1].to_i
-        end_index = triple[2].to_i - 1
-        results << {match_id: match_id, start_index: start_index, end_index: end_index}
-      end
-      results
+    # It is recommended to use {Language#vocab} or {Token#lexeme} methods to create tokens.
+    # There is no way to generate a lexeme from scratch but relying on a pre-existing Python {Lexeme} object.
+    # @param py_lexeme [Object] Python `Lexeme` object
+    def initialize(py_lexeme)
+      @py_lexeme = py_lexeme
+      @text = @py_lexeme.text
     end
-  end
-
-  # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
-  class Language
-
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_nlp_id
-
-    # @return [Object] a Python `Language` instance accessible via `PyCall`
-    attr_reader :py_nlp
 
-    #
-    # @
-    def
-      @
-      PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
-      PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
-      @py_nlp = PyCall.eval(@spacy_nlp_id)
+    # String representation of the token.
+    # @return [String]
+    def to_s
+      @text
     end
 
-    #
-    # @
-    def
-
+    # Returns the lowercase form by calling `lower_` of `@py_lexeme` object
+    # @return [String]
+    def lower
+      @py_lexeme.lower_
     end
 
-    #
-    # @return [
-    def
-
+    # Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_lexeme` object
+    # @return [String]
+    def shape
+      @py_lexeme.shape_
     end
 
-    #
-    # @
-
-
-      PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+    # Returns the language by calling `lang_` of `@py_lexeme` object
+    # @return [String]
+    def lang
+      @py_lexeme.lang_
     end
 
-    #
-    # @return [
-    def
-
-      PyCall::List.(@py_nlp.pipe_names).each do |pipe|
-        pipe_array << pipe
-      end
-      pipe_array
+    # Returns the length-N substring from the start of the word by calling `prefix_` of `@py_lexeme` object
+    # @return [String]
+    def prefix
+      @py_lexeme.prefix_
     end
-
-    #
-    # @return [
-    def
-
+    #
+    # Returns the length-N substring from the end of the word by calling `suffix_` of `@py_lexeme` object
+    # @return [String]
+    def suffix
+      @py_lexeme.suffix_
     end
 
-    #
-    # @
-
-
-      text = text.gsub("'", "\'")
-      py_lexeme = PyCall.eval("#{@spacy_nlp_id}.vocab['#{text}']")
-      return py_lexeme
+    # Returns the lexeme's norm, i.e. a normalized form of the lexeme, by calling `norm_` of `@py_lexeme` object
+    # @return [String]
+    def norm
+      @py_lexeme.norm_
     end
 
-    # Returns
-    # @param
-    # @return [
-    def
-
-      py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
-      key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
-      keys = key_texts.map{|kt| kt[0]}
-      texts = key_texts.map{|kt| kt[1]}
-      best_rows = PyCall::List.(py_result[1])[0]
-      scores = PyCall::List.(py_result[2])[0]
-
-      results = []
-      n.times do |i|
-        results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
-      end
-
-      results
+    # Returns a semantic similarity estimate.
+    # @param other [Lexeme] the other lexeme to which a similarity estimation is made
+    # @return [Float]
+    def similarity(other)
+      @py_lexeme.similarity(other.py_lexeme)
     end
 
-    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
-      @
+      @py_lexeme.send(name, *args)
     end
   end
 
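Net effect of this last hunk: the old string-eval implementations of `Doc`, `Matcher`, and `Language` are deleted in favor of the versions earlier in the file, `Token` gains plain-string getters (`lemma`, `lower`, `shape`, `pos`, `tag`, `dep`, `lang`, `whitespace`, `ent_type`, `morphology`) wrapping spaCy's underscore attributes, and the new `Lexeme` class wraps vocabulary entries. A usage sketch (vector similarity needs a model with vectors such as `en_core_web_lg`; printed values are illustrative):

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_lg")
doc = nlp.read("Apples are tasty")

token = doc[0]
puts token.lemma       # wraps Python lemma_ (e.g. "apple")
puts token.pos         # wraps Python pos_   (e.g. "NOUN")
puts token.morphology  # morphological features, a hash by default

# Lexeme lookup and similarity, as in similarity_between_lexemes.rb above:
puts nlp.vocab("orange").similarity(nlp.vocab("lemon"))
```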