ruby-spacy 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +58 -0
  3. data/.yardopts +2 -0
  4. data/Gemfile +18 -0
  5. data/Gemfile.lock +39 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +498 -0
  8. data/Rakefile +12 -0
  9. data/bin/console +15 -0
  10. data/bin/setup +8 -0
  11. data/examples/get_started/lexeme.rb +24 -0
  12. data/examples/get_started/linguistic_annotations.rb +32 -0
  13. data/examples/get_started/most_similar.rb +46 -0
  14. data/examples/get_started/named_entities.rb +24 -0
  15. data/examples/get_started/outputs/test_dep.svg +84 -0
  16. data/examples/get_started/outputs/test_dep_compact.svg +84 -0
  17. data/examples/get_started/outputs/test_ent.html +11 -0
  18. data/examples/get_started/pos_tags_and_dependencies.rb +31 -0
  19. data/examples/get_started/similarity.rb +13 -0
  20. data/examples/get_started/tokenization.rb +22 -0
  21. data/examples/get_started/visualizing_dependencies.rb +14 -0
  22. data/examples/get_started/visualizing_dependencies_compact.rb +12 -0
  23. data/examples/get_started/visualizing_named_entities.rb +12 -0
  24. data/examples/get_started/vocab.rb +10 -0
  25. data/examples/get_started/word_vectors.rb +24 -0
  26. data/examples/japanese/ancestors.rb +44 -0
  27. data/examples/japanese/entity_annotations_and_labels.rb +45 -0
  28. data/examples/japanese/information_extraction.rb +27 -0
  29. data/examples/japanese/lemmatization.rb +32 -0
  30. data/examples/japanese/most_similar.rb +46 -0
  31. data/examples/japanese/named_entity_recognition.rb +27 -0
  32. data/examples/japanese/navigating_parse_tree.rb +34 -0
  33. data/examples/japanese/noun_chunks.rb +23 -0
  34. data/examples/japanese/outputs/test_dep.svg +149 -0
  35. data/examples/japanese/outputs/test_ent.html +16 -0
  36. data/examples/japanese/pos_tagging.rb +34 -0
  37. data/examples/japanese/sentence_segmentation.rb +16 -0
  38. data/examples/japanese/similarity.rb +12 -0
  39. data/examples/japanese/tokenization.rb +38 -0
  40. data/examples/japanese/visualizing_dependencies.rb +13 -0
  41. data/examples/japanese/visualizing_named_entities.rb +14 -0
  42. data/examples/linguistic_features/ancestors.rb +41 -0
  43. data/examples/linguistic_features/entity_annotations_and_labels.rb +29 -0
  44. data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +20 -0
  45. data/examples/linguistic_features/information_extraction.rb +36 -0
  46. data/examples/linguistic_features/iterating_children.rb +24 -0
  47. data/examples/linguistic_features/iterating_lefts_and_rights.rb +20 -0
  48. data/examples/linguistic_features/lemmatization.rb +31 -0
  49. data/examples/linguistic_features/morphology.rb +17 -0
  50. data/examples/linguistic_features/named_entity_recognition.rb +25 -0
  51. data/examples/linguistic_features/navigating_parse_tree.rb +32 -0
  52. data/examples/linguistic_features/noun_chunks.rb +27 -0
  53. data/examples/linguistic_features/outputs/test_ent.html +11 -0
  54. data/examples/linguistic_features/pos_tagging.rb +31 -0
  55. data/examples/linguistic_features/retokenize_1.rb +29 -0
  56. data/examples/linguistic_features/retokenize_2.rb +16 -0
  57. data/examples/linguistic_features/rule_based_morphology.rb +12 -0
  58. data/examples/linguistic_features/sentence_segmentation.rb +16 -0
  59. data/examples/linguistic_features/similarity.rb +14 -0
  60. data/examples/linguistic_features/similarity_between_spans.rb +23 -0
  61. data/examples/linguistic_features/special_case_tokenization_rules.rb +19 -0
  62. data/examples/linguistic_features/tokenization.rb +23 -0
  63. data/examples/rule_based_matching/creating_spans_from_matches.rb +16 -0
  64. data/examples/rule_based_matching/matcher.rb +19 -0
  65. data/lib/ruby-spacy.rb +567 -0
  66. data/lib/ruby-spacy/version.rb +6 -0
  67. data/ruby-spacy.gemspec +42 -0
  68. metadata +157 -0
@@ -0,0 +1,27 @@
1
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

# Inspect the lemmatizer pipeline component's mode.
lemmatizer = nlp.get_pipe("lemmatizer")
puts "Lemmatizer mode: #{lemmatizer.mode}"

doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")

headings = ["text", "root.text", "root.dep", "root.head.text"]

# One table row per noun chunk: surface text, root token, root dependency,
# and the root's syntactic head.
rows = doc.noun_chunks.map do |chunk|
  [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
end

puts Terminal::Table.new(rows: rows, headings: headings)

# +---------------------+---------------+----------+----------------+
# | text                | root.text     | root.dep | root.head.text |
# +---------------------+---------------+----------+----------------+
# | Autonomous cars     | cars          | nsubj    | shift          |
# | insurance liability | liability     | dobj     | shift          |
# | manufacturers       | manufacturers | pobj     | toward         |
# +---------------------+---------------+----------+----------------+
@@ -0,0 +1,11 @@
1
+ <div class="entities" style="line-height: 2.5; direction: ltr">When
2
+ <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
3
+ Sebastian Thrun
4
+ <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
5
+ </mark>
6
+ started working on self-driving cars at Google in
7
+ <mark class="entity" style="background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
8
+ 2007
9
+ <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">DATE</span>
10
+ </mark>
11
+ , few people outside of the company took him seriously.</div>
@@ -0,0 +1,31 @@
1
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")

headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]

# Collect one row of token-level annotations per token in the doc.
rows = doc.map do |token|
  [token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
   token.shape_, token.is_alpha, token.is_stop]
end

puts Terminal::Table.new(rows: rows, headings: headings)

# +---------+---------+-------+-----+----------+-------+----------+---------+
# | text    | lemma   | pos   | tag | dep      | shape | is_alpha | is_stop |
# +---------+---------+-------+-----+----------+-------+----------+---------+
# | Apple   | Apple   | PROPN | NNP | nsubj    | Xxxxx | true     | false   |
# | is      | be      | AUX   | VBZ | aux      | xx    | true     | true    |
# | looking | look    | VERB  | VBG | ROOT     | xxxx  | true     | false   |
# | at      | at      | ADP   | IN  | prep     | xx    | true     | true    |
# | buying  | buy     | VERB  | VBG | pcomp    | xxxx  | true     | false   |
# | U.K.    | U.K.    | PROPN | NNP | dobj     | X.X.  | false    | false   |
# | startup | startup | NOUN  | NN  | advcl    | xxxx  | true     | false   |
# | for     | for     | ADP   | IN  | prep     | xxx   | true     | true    |
# | $       | $       | SYM   | $   | quantmod | $     | false    | false   |
# | 1       | 1       | NUM   | CD  | compound | d     | false    | false   |
# | billion | billion | NUM   | CD  | pobj     | xxxx  | true     | false   |
# +---------+---------+-------+-----+----------+-------+----------+---------+
@@ -0,0 +1,29 @@
1
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

doc = nlp.read("Credit and mortgage account holders must submit their requests")

headings = ["text", "pos", "dep", "head text"]

# Merge the subtree spanning the 5th token (index 4) — from its leftmost to
# its rightmost descendant — into a single token.
doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)

rows = doc.map do |token|
  [token.text, token.pos_, token.dep_, token.head.text]
end

puts Terminal::Table.new(rows: rows, headings: headings)

# +-------------------------------------+------+-------+-----------+
# | text                                | pos  | dep   | head text |
# +-------------------------------------+------+-------+-----------+
# | Credit and mortgage account holders | NOUN | nsubj | submit    |
# | must                                | AUX  | aux   | submit    |
# | submit                              | VERB | ROOT  | submit    |
# | their                               | PRON | poss  | requests  |
# | requests                            | NOUN | dobj  | submit    |
# +-------------------------------------+------+-------+-----------+
@@ -0,0 +1,16 @@
1
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

doc = nlp.read("I live in New York")

# Array#join calls #to_s on each Token, so no explicit mapping is needed.
puts "Before: #{doc.tokens.join(', ')}"

# Merge tokens 3..4 ("New", "York") into the single token "New York".
doc.retokenize(3, 4)

puts "After: #{doc.tokens.join(', ')}"

# Before: I, live, in, New, York
# After: I, live, in, New York
@@ -0,0 +1,12 @@
1
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

doc = nlp.read("Where are you?")

# Look at the morphological features and part of speech of the third token.
third_word = doc[2]
puts "Morph features of the third word: #{third_word.morph}"
puts "POS of the third word: #{third_word.pos_}"

# Morph features of the third word: Case=Nom|Person=2|PronType=Prs
# POS of the third word: PRON
@@ -0,0 +1,16 @@
1
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")

doc = nlp.read("This is a sentence. This is another sentence.")

# Check whether sentence boundaries have been set on the doc.
puts "doc has annotation SENT_START: #{doc.has_annotation('SENT_START')}"

doc.sents.each { |sentence| puts sentence.text }

# doc has annotation SENT_START: true
# This is a sentence.
# This is another sentence.
@@ -0,0 +1,14 @@
1
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_lg")
doc1 = nlp.read("I like salty fries and hamburgers.")
doc2 = nlp.read("Fast food tastes very good.")

# Use interpolation instead of String#+: Spacy::Doc defines #to_s but is not
# a String (no #to_str), so `"Doc 1: " + doc1` raised TypeError.
puts "Doc 1: #{doc1}"
puts "Doc 2: #{doc2}"
puts "Similarity: #{doc1.similarity(doc2)}"

# Doc 1: I like salty fries and hamburgers.
# Doc 2: Fast food tastes very good.
# Similarity: 0.7687607012190486
@@ -0,0 +1,23 @@
1
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_lg")
doc1 = nlp.read("I like salty fries and hamburgers.")
doc2 = nlp.read("Fast food tastes very good.")

# Use interpolation instead of String#+: Spacy::Doc defines #to_s but is not
# a String (no #to_str), so `"Doc 1: " + doc1` raised TypeError.
puts "Doc 1: #{doc1}"
puts "Doc 2: #{doc2}"
puts "Similarity: #{doc1.similarity(doc2)}"

span1 = doc1.span(2, 2)  # salty fries
span2 = doc1.span(5..5)  # hamburgers
puts "Span 1: #{span1.text}"
puts "Span 2: #{span2.text}"
puts "Similarity: #{span1.similarity(span2)}"

# Doc 1: I like salty fries and hamburgers.
# Doc 2: Fast food tastes very good.
# Similarity: 0.7687607012190486
# Span 1: salty fries
# Span 2: hamburgers
# Similarity: 0.6949787735939026
@@ -0,0 +1,19 @@
1
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

doc = nlp.read("gimme that")
puts doc.tokens.join(" ")

# Add a special-case rule splitting "gimme" into "gim" + "me".
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])

# Check the new tokenization
puts nlp.read("gimme that").tokens.join(" ")

# gimme that
# gim me that
@@ -0,0 +1,23 @@
1
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")

row = doc.map(&:text)

# Number the column headings dynamically instead of hard-coding 1..11, so
# the table stays consistent if the sentence (and token count) changes.
headings = (1..row.size).to_a

table = Terminal::Table.new rows: [row], headings: headings
puts table

# +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
# | 1     | 2  | 3       | 4  | 5      | 6    | 7       | 8   | 9 | 10 | 11      |
# +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
# | Apple | is | looking | at | buying | U.K. | startup | for | $ | 1  | billion |
# +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
@@ -0,0 +1,16 @@
1
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_lg")
matcher = nlp.matcher
matcher.add("US_PRESIDENT", [[{LOWER: "barack"}, {LOWER: "obama"}]])
doc = nlp.read("Barack Obama was the 44th president of the United States")

# Wrap each match in a Spacy::Span labeled with the pattern's match id.
matcher.match(doc).each do |match|
  span = Spacy::Span.new(doc,
                         start_index: match[:start_index],
                         end_index: match[:end_index],
                         options: {label: match[:match_id]})
  puts "#{span.text} / #{span.label_}"
end

# Barack Obama / US_PRESIDENT
@@ -0,0 +1,19 @@
1
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")

# Pattern: "hello", any punctuation, then "world" (case-insensitive).
matcher = nlp.matcher
matcher.add("HelloWorld", [[{LOWER: "hello"}, {IS_PUNCT: true}, {LOWER: "world"}]])

doc = nlp.read("Hello, world! Hello world!")

matcher.match(doc).each do |match|
  string_id = nlp.vocab_string_lookup(match[:match_id])
  span = doc.span(match[:start_index]..match[:end_index])
  puts "#{string_id}, #{span.text}"
end

# HelloWorld, Hello, world
data/lib/ruby-spacy.rb ADDED
@@ -0,0 +1,567 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "ruby-spacy/version"
4
+ require 'enumerator'
5
+ require 'strscan'
6
+ require 'pycall/import'
7
+ require 'numpy'
8
+ include PyCall::Import
9
+
10
+ # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
11
+ module Spacy
12
# A utility module method to convert Python's generator object to a Ruby array,
# mainly used on the items inside the array returned from dependency-related
# methods such as {Span#rights}, {Span#lefts} and {Span#subtree}.
# @param py_generator [Object] a Python generator accessible via PyCall
# @return [Object] a list of the generator's items
def self.generator_to_array(py_generator)
  PyCall::List.call(py_generator)
end
18
+
19
# See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
class Span
  # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
  attr_reader :spacy_span_id

  # @return [Object] a Python `Span` instance accessible via `PyCall`
  attr_reader :py_span

  # @return [Doc] the document to which the span belongs
  attr_reader :doc

  include Enumerable

  alias_method :length, :count
  alias_method :len, :count
  alias_method :size, :count

  # It is recommended to use the {Doc#span} method to create a span. If you need to
  # create one with {Span#initialize}, use either of the two signatures:
  # `Span.new(doc, py_span: py_span)` or
  # `Span.new(doc, start_index: i, end_index: j, options: {...})`.
  # @param doc [Doc] the document to which this span belongs
  # @param py_span [Object] an existing Python `Span` object to wrap (optional)
  # @param start_index [Integer] the index of the token starting the span inside the doc
  # @param end_index [Integer] the index of the token ending the span inside the doc (inclusive)
  # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
  def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
    @doc = doc
    @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
    if py_span
      @py_span = py_span
    else
      # Build the Python Span inside the interpreter; spaCy's Span takes an
      # exclusive end index, hence `end_index + 1`.
      options = PyCall::Dict.(options)
      PyCall.exec("#{@spacy_span_id}_opts = #{options}")
      PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
      @py_span = PyCall.eval(@spacy_span_id)
    end
  end

  # Returns an array of tokens contained in the span.
  # @return [Array<Token>]
  def tokens
    PyCall::List.(@py_span).map { |py_token| Token.new(py_token) }
  end

  # Iterates over the elements in the span yielding a token instance.
  def each
    PyCall::List.(@py_span).each do |py_token|
      yield Token.new(py_token)
    end
  end

  # Returns an array of spans of noun chunks.
  # @return [Array<Span>]
  def noun_chunks
    PyCall::List.(@py_span.noun_chunks).map do |py_chunk|
      Spacy::Span.new(@doc, py_span: py_chunk)
    end
  end

  # Returns an array of spans that represent sentences.
  # @return [Array<Span>]
  def sents
    PyCall::List.(@py_span.sents).map do |py_sent|
      Spacy::Span.new(@doc, py_span: py_sent)
    end
  end

  # Returns an array of spans that represent named entities.
  # @return [Array<Span>]
  def ents
    PyCall::List.(@py_span.ents).map do |py_ent|
      Spacy::Span.new(@doc, py_span: py_ent)
    end
  end

  # Returns a span that represents the sentence that the given span is part of.
  # @return [Span]
  def sent
    # (Fixes the malformed `py_span =@py_span.sent` spacing of the original.)
    Spacy::Span.new(@doc, py_span: @py_span.sent)
  end

  # Returns a span if a range object is given, or a token if an integer
  # representing a position in the span is given.
  # @param range [Range, Integer] a range such as `0..3`, or a token index
  def [](range)
    if range.is_a?(Range)
      py_span = @py_span[range]
      Spacy::Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
    else
      Spacy::Token.new(@py_span[range])
    end
  end

  # Returns a semantic similarity estimate.
  # @param other [Span] the other span to which a similarity estimation is conducted
  # @return [Float]
  def similarity(other)
    PyCall.eval("#{@spacy_span_id}.similarity(#{other.spacy_span_id})")
  end

  # Creates a document instance from the text of this span.
  # @return [Doc]
  def as_doc
    Spacy::Doc.new(@doc.spacy_nlp_id, text)
  end

  # Returns tokens conjugated to the root of the span.
  # @return [Array<Token>]
  def conjuncts
    PyCall::List.(@py_span.conjuncts).map { |py_token| Spacy::Token.new(py_token) }
  end

  # Returns tokens that are to the left of the span, whose heads are within the span.
  # @return [Array<Token>]
  def lefts
    PyCall::List.(@py_span.lefts).map { |py_token| Spacy::Token.new(py_token) }
  end

  # Returns tokens that are to the right of the span, whose heads are within the span.
  # @return [Array<Token>]
  def rights
    PyCall::List.(@py_span.rights).map { |py_token| Spacy::Token.new(py_token) }
  end

  # Returns tokens that are within the span and tokens that descend from them.
  # @return [Array<Token>]
  def subtree
    PyCall::List.(@py_span.subtree).map { |py_token| Spacy::Token.new(py_token) }
  end

  # Methods defined in Python but not wrapped in ruby-spacy can be called by
  # this dynamic method handling mechanism.
  def method_missing(name, *args)
    @py_span.send(name, *args)
  end

  # Keep respond_to? consistent with the dynamic delegation above.
  def respond_to_missing?(name, include_private = false)
    @py_span.respond_to?(name) || super
  end
end
182
+
183
# See also spaCy Python API document for [`Token`](https://spacy.io/api/token).
class Token
  # @return [Object] a Python `Token` instance accessible via `PyCall`
  attr_reader :py_token

  # @return [String] a string representing the token
  attr_reader :text

  # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create
  # tokens. There is no way to generate a token from scratch except by wrapping
  # a pre-existing Python `Token` object.
  # @param py_token [Object] Python `Token` object
  def initialize(py_token)
    @py_token = py_token
    @text = @py_token.text
  end

  # Returns the token in question and the tokens that descend from it.
  # @return [Array<Object>] a Ruby array of Python `Token` objects
  def subtree
    PyCall::List.(@py_token.subtree).to_a
  end

  # Returns the token's ancestors.
  # @return [Array<Object>] a Ruby array of Python `Token` objects
  def ancestors
    PyCall::List.(@py_token.ancestors).to_a
  end

  # Returns a sequence of the token's immediate syntactic children.
  # @return [Array<Object>] a Ruby array of Python `Token` objects
  def children
    PyCall::List.(@py_token.children).to_a
  end

  # The leftward immediate children of the word in the syntactic dependency parse.
  # @return [Array<Object>] a Ruby array of Python `Token` objects
  def lefts
    PyCall::List.(@py_token.lefts).to_a
  end

  # The rightward immediate children of the word in the syntactic dependency parse.
  # @return [Array<Object>] a Ruby array of Python `Token` objects
  def rights
    PyCall::List.(@py_token.rights).to_a
  end

  # String representation of the token.
  # @return [String]
  def to_s
    @text
  end

  # Methods defined in Python but not wrapped in ruby-spacy can be called by
  # this dynamic method handling mechanism.
  def method_missing(name, *args)
    @py_token.send(name, *args)
  end

  # Keep respond_to? consistent with the dynamic delegation above.
  def respond_to_missing?(name, include_private = false)
    @py_token.respond_to?(name) || super
  end
end
260
+
261
# See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
class Doc
  # @return [String] an identifier string that can be used when referring to the Python `nlp` object inside `PyCall::exec` or `PyCall::eval`
  attr_reader :spacy_nlp_id

  # @return [String] an identifier string that can be used when referring to the Python `Doc` object inside `PyCall::exec` or `PyCall::eval`
  attr_reader :spacy_doc_id

  # @return [Object] a Python `Doc` instance accessible via `PyCall`
  attr_reader :py_doc

  # @return [String] a text string of the document
  attr_reader :text

  include Enumerable

  alias_method :length, :count
  alias_method :len, :count
  alias_method :size, :count

  # Creates a new instance of {Doc}.
  # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
  # @param text [String] The text string to be analyzed
  def initialize(nlp_id, text)
    @text = text
    @spacy_nlp_id = nlp_id
    @spacy_doc_id = "doc_#{text.object_id}"
    # NOTE(review): only double quotes are escaped before interpolating the
    # text into generated Python source; text containing backslashes or an
    # embedded `"""` could still break the assignment — confirm inputs.
    quoted = text.gsub('"', '\"')
    PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
    PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
    @py_doc = PyCall.eval(@spacy_doc_id)
  end

  # Retokenizes the text merging a span into a single token.
  # @param start_index [Integer] The start position of the span to be retokenized in the document
  # @param end_index [Integer] The end position (inclusive) of the span to be retokenized in the document
  # @param attributes [Hash] Attributes to set on the merged token
  def retokenize(start_index, end_index, attributes = {})
    py_attrs = PyCall::Dict.(attributes)
    PyCall.exec(<<PY)
with #{@spacy_doc_id}.retokenize() as retokenizer:
    retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
PY
    # Re-fetch the Python doc so subsequent reads see the merged tokens.
    @py_doc = PyCall.eval(@spacy_doc_id)
  end

  # Retokenizes the text splitting the specified token.
  # @param pos_in_doc [Integer] The position of the token to be split in the document
  # @param split_array [Array<String>] text strings of the split results
  # @param head_pos_in_split [Integer] The position of the head element within the split elements
  # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
  # @param attributes [Hash] The attributes of the split elements
  def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
    py_attrs = PyCall::Dict.(attributes)
    py_split_array = PyCall::List.(split_array)
    PyCall.exec(<<PY)
with #{@spacy_doc_id}.retokenize() as retokenizer:
    heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
    attrs = #{py_attrs}
    split_array = #{py_split_array}
    retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
PY
    @py_doc = PyCall.eval(@spacy_doc_id)
  end

  # String representation of the document.
  # @return [String]
  def to_s
    @text
  end

  # Returns an array of tokens contained in the doc.
  # @return [Array<Token>]
  def tokens
    PyCall::List.(@py_doc).map { |py_token| Token.new(py_token) }
  end

  # Iterates over the elements in the doc yielding a token instance.
  def each
    PyCall::List.(@py_doc).each do |py_token|
      yield Token.new(py_token)
    end
  end

  # Returns a span of the specified range within the doc.
  # Use either `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
  # @param range_or_start [Range, Integer] A range object, or an integer that represents the start position of the span
  # @param optional_size [Integer] An integer representing the size of the span
  # @return [Span]
  def span(range_or_start, optional_size = nil)
    if optional_size
      start_index = range_or_start
      temp = tokens[start_index...start_index + optional_size]
    else
      start_index = range_or_start.first
      temp = tokens[range_or_start]
    end
    end_index = start_index + temp.size - 1
    Span.new(self, start_index: start_index, end_index: end_index)
  end

  # Returns an array of spans representing noun chunks.
  # @return [Array<Span>]
  def noun_chunks
    PyCall::List.(@py_doc.noun_chunks).map do |py_chunk|
      # spaCy's end index is exclusive; Spacy::Span's is inclusive.
      Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
    end
  end

  # Returns an array of spans representing sentences.
  # @return [Array<Span>]
  def sents
    PyCall::List.(@py_doc.sents).map do |py_sent|
      Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
    end
  end

  # Returns an array of named entities.
  # NOTE(review): unlike {Span#ents}, these are the raw Python `Span` objects,
  # not wrapped in {Spacy::Span}; kept as-is for backward compatibility.
  # @return [Array<Object>] a Ruby array of Python `Span` objects
  def ents
    PyCall::List.(@py_doc.ents).to_a
  end

  # Returns a span if given a range object; returns a token if given an integer
  # representing a position in the doc.
  # @param range [Range, Integer] a range such as `0..3`, or a token index
  def [](range)
    if range.is_a?(Range)
      py_span = @py_doc[range]
      Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
    else
      Token.new(@py_doc[range])
    end
  end

  # Returns a semantic similarity estimate.
  # @param other [Doc] the other doc to which a similarity estimation is made
  # @return [Float]
  def similarity(other)
    PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
  end

  # Visualize the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
  # @param style [String] Either `dep` or `ent`
  # @param compact [Boolean] Only relevant to the `dep` style
  # @return [String] an SVG string for `dep`, an HTML string for `ent`
  def displacy(style: "dep", compact: false)
    PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
  end

  # Methods defined in Python but not wrapped in ruby-spacy can be called by
  # this dynamic method handling mechanism.
  def method_missing(name, *args)
    @py_doc.send(name, *args)
  end

  # Keep respond_to? consistent with the dynamic delegation above.
  def respond_to_missing?(name, include_private = false)
    @py_doc.respond_to?(name) || super
  end
end
434
+
435
# See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
class Matcher
  # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
  attr_reader :spacy_matcher_id

  # @return [Object] a Python `Matcher` instance accessible via `PyCall`
  attr_reader :py_matcher

  # Creates a {Matcher} instance.
  # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
  def initialize(nlp_id)
    @spacy_matcher_id = "doc_#{nlp_id}_matcher"
    PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
    @py_matcher = PyCall.eval(@spacy_matcher_id)
  end

  # Adds a rule to the matcher.
  # @param text [String] a label string given to the pattern
  # @param pattern [Array<Array<Hash>>] alternative sequences of token patterns
  def add(text, pattern)
    @py_matcher.add(text, pattern)
  end

  # Executes the match against a doc.
  # @param doc [Doc] a {Doc} instance
  # @return [Array<Hash{Symbol => Integer}>] hashes with `:match_id`,
  #   `:start_index` and `:end_index` (end index is inclusive)
  def match(doc)
    # The Python matcher returns a list of (match_id, start, end) tuples;
    # they are parsed out of the list's string representation here.
    str_results = PyCall.eval("#{@spacy_matcher_id}(#{doc.spacy_doc_id})").to_s
    scanner = StringScanner.new(str_results[1..-2])
    results = []
    # Read the three numbers straight from the capture groups instead of
    # re-splitting the matched text; the former `next unless s.matched`
    # guard was dead code because a successful scan_until always sets it.
    while scanner.scan_until(/(\d+), (\d+), (\d+)/)
      results << {
        match_id: scanner[1].to_i,
        start_index: scanner[2].to_i,
        end_index: scanner[3].to_i - 1
      }
    end
    results
  end
end
477
+
478
# See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
class Language
  # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
  attr_reader :spacy_nlp_id

  # @return [Object] a Python `Language` instance accessible via `PyCall`
  attr_reader :py_nlp

  # Creates a language model instance, which is conventionally referred to by
  # a variable named `nlp`.
  # @param model [String] A language model installed in the system
  def initialize(model = "en_core_web_sm")
    @spacy_nlp_id = "nlp_#{model.object_id}"
    PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
    PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
    @py_nlp = PyCall.eval(@spacy_nlp_id)
  end

  # Reads and analyzes the given text.
  # @param text [String] A text to be read and analyzed
  # @return [Doc]
  def read(text)
    Doc.new(@spacy_nlp_id, text)
  end

  # Generates a matcher for the current language model.
  # @return [Matcher]
  def matcher
    Matcher.new(@spacy_nlp_id)
  end

  # A utility method to look up a vocabulary item of the given id.
  # @param id [Integer] A vocabulary id
  # @return [Object] the string stored under that id in the vocab
  def vocab_string_lookup(id)
    PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
  end

  # A utility method to list pipeline components.
  # @return [Array<String>] An array of text strings representing pipeline components
  def pipe_names
    PyCall::List.(@py_nlp.pipe_names).to_a
  end

  # A utility method to get the tokenizer Python object.
  # @return [Object] Python `Tokenizer` object
  def tokenizer
    PyCall.eval("#{@spacy_nlp_id}.tokenizer")
  end

  # A utility method to get a Python `Lexeme` object.
  # @param text [String] A text string representing a lexeme
  # @return [Object] Python `Lexeme` object
  def get_lexeme(text)
    # Escape single quotes for the single-quoted Python string literal below.
    # The original `text.gsub("'", "\'")` was a no-op because "\'" == "'";
    # the block form avoids backreference expansion in the replacement.
    escaped = text.gsub("'") { "\\'" }
    PyCall.eval("#{@spacy_nlp_id}.vocab['#{escaped}']")
  end

  # Returns _n_ lexemes having the vector representations that are the most
  # similar to a given vector representation of a word.
  # @param vector [Object] A vector representation of a word (whether existing or non-existing)
  # @param n [Integer] The number of results to return
  # @return [Array<Hash{Symbol => Object}>] hashes each containing the `:key`,
  #   `:text`, `:best_row` and similarity `:score` of a lexeme
  def most_similar(vector, n)
    vec_array = Numpy.asarray([vector])
    py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
    key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
    keys = key_texts.map { |kt| kt[0] }
    texts = key_texts.map { |kt| kt[1] }
    best_rows = PyCall::List.(py_result[1])[0]
    scores = PyCall::List.(py_result[2])[0]

    n.times.map do |i|
      {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
    end
  end

  # Methods defined in Python but not wrapped in ruby-spacy can be called by
  # this dynamic method handling mechanism.
  def method_missing(name, *args)
    @py_nlp.send(name, *args)
  end

  # Keep respond_to? consistent with the dynamic delegation above.
  def respond_to_missing?(name, include_private = false)
    @py_nlp.respond_to?(name) || super
  end
end
565
+
566
+ end
567
+