ruby-spacy 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +58 -0
- data/.yardopts +2 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +39 -0
- data/LICENSE.txt +21 -0
- data/README.md +498 -0
- data/Rakefile +12 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/examples/get_started/lexeme.rb +24 -0
- data/examples/get_started/linguistic_annotations.rb +32 -0
- data/examples/get_started/most_similar.rb +46 -0
- data/examples/get_started/named_entities.rb +24 -0
- data/examples/get_started/outputs/test_dep.svg +84 -0
- data/examples/get_started/outputs/test_dep_compact.svg +84 -0
- data/examples/get_started/outputs/test_ent.html +11 -0
- data/examples/get_started/pos_tags_and_dependencies.rb +31 -0
- data/examples/get_started/similarity.rb +13 -0
- data/examples/get_started/tokenization.rb +22 -0
- data/examples/get_started/visualizing_dependencies.rb +14 -0
- data/examples/get_started/visualizing_dependencies_compact.rb +12 -0
- data/examples/get_started/visualizing_named_entities.rb +12 -0
- data/examples/get_started/vocab.rb +10 -0
- data/examples/get_started/word_vectors.rb +24 -0
- data/examples/japanese/ancestors.rb +44 -0
- data/examples/japanese/entity_annotations_and_labels.rb +45 -0
- data/examples/japanese/information_extraction.rb +27 -0
- data/examples/japanese/lemmatization.rb +32 -0
- data/examples/japanese/most_similar.rb +46 -0
- data/examples/japanese/named_entity_recognition.rb +27 -0
- data/examples/japanese/navigating_parse_tree.rb +34 -0
- data/examples/japanese/noun_chunks.rb +23 -0
- data/examples/japanese/outputs/test_dep.svg +149 -0
- data/examples/japanese/outputs/test_ent.html +16 -0
- data/examples/japanese/pos_tagging.rb +34 -0
- data/examples/japanese/sentence_segmentation.rb +16 -0
- data/examples/japanese/similarity.rb +12 -0
- data/examples/japanese/tokenization.rb +38 -0
- data/examples/japanese/visualizing_dependencies.rb +13 -0
- data/examples/japanese/visualizing_named_entities.rb +14 -0
- data/examples/linguistic_features/ancestors.rb +41 -0
- data/examples/linguistic_features/entity_annotations_and_labels.rb +29 -0
- data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +20 -0
- data/examples/linguistic_features/information_extraction.rb +36 -0
- data/examples/linguistic_features/iterating_children.rb +24 -0
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +20 -0
- data/examples/linguistic_features/lemmatization.rb +31 -0
- data/examples/linguistic_features/morphology.rb +17 -0
- data/examples/linguistic_features/named_entity_recognition.rb +25 -0
- data/examples/linguistic_features/navigating_parse_tree.rb +32 -0
- data/examples/linguistic_features/noun_chunks.rb +27 -0
- data/examples/linguistic_features/outputs/test_ent.html +11 -0
- data/examples/linguistic_features/pos_tagging.rb +31 -0
- data/examples/linguistic_features/retokenize_1.rb +29 -0
- data/examples/linguistic_features/retokenize_2.rb +16 -0
- data/examples/linguistic_features/rule_based_morphology.rb +12 -0
- data/examples/linguistic_features/sentence_segmentation.rb +16 -0
- data/examples/linguistic_features/similarity.rb +14 -0
- data/examples/linguistic_features/similarity_between_spans.rb +23 -0
- data/examples/linguistic_features/special_case_tokenization_rules.rb +19 -0
- data/examples/linguistic_features/tokenization.rb +23 -0
- data/examples/rule_based_matching/creating_spans_from_matches.rb +16 -0
- data/examples/rule_based_matching/matcher.rb +19 -0
- data/lib/ruby-spacy.rb +567 -0
- data/lib/ruby-spacy/version.rb +6 -0
- data/ruby-spacy.gemspec +42 -0
- metadata +157 -0
data/examples/linguistic_features/noun_chunks.rb
ADDED
@@ -0,0 +1,27 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_sm")
+
+lemmatizer = nlp.get_pipe("lemmatizer")
+puts "Lemmatizer mode: " + lemmatizer.mode
+
+doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")
+
+headings = ["text", "root.text", "root.dep", "root.head.text"]
+rows = []
+
+doc.noun_chunks.each do |chunk|
+  rows << [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
+end
+
+table = Terminal::Table.new rows: rows, headings: headings
+puts table
+
+# +---------------------+---------------+----------+----------------+
+# | text                | root.text     | root.dep | root.head.text |
+# +---------------------+---------------+----------+----------------+
+# | Autonomous cars     | cars          | nsubj    | shift          |
+# | insurance liability | liability     | dobj     | shift          |
+# | manufacturers       | manufacturers | pobj     | toward         |
+# +---------------------+---------------+----------+----------------+
data/examples/linguistic_features/outputs/test_ent.html
ADDED
@@ -0,0 +1,11 @@
+<div class="entities" style="line-height: 2.5; direction: ltr">When
+<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
+    Sebastian Thrun
+    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
+</mark>
+ started working on self-driving cars at Google in
+<mark class="entity" style="background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
+    2007
+    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">DATE</span>
+</mark>
+, few people outside of the company took him seriously.</div>
data/examples/linguistic_features/pos_tagging.rb
ADDED
@@ -0,0 +1,31 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_sm")
+doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
+
+headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
+rows = []
+
+doc.each do |token|
+  rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
+end
+
+table = Terminal::Table.new rows: rows, headings: headings
+puts table
+
+# +---------+---------+-------+-----+----------+-------+----------+---------+
+# | text    | lemma   | pos   | tag | dep      | shape | is_alpha | is_stop |
+# +---------+---------+-------+-----+----------+-------+----------+---------+
+# | Apple   | Apple   | PROPN | NNP | nsubj    | Xxxxx | true     | false   |
+# | is      | be      | AUX   | VBZ | aux      | xx    | true     | true    |
+# | looking | look    | VERB  | VBG | ROOT     | xxxx  | true     | false   |
+# | at      | at      | ADP   | IN  | prep     | xx    | true     | true    |
+# | buying  | buy     | VERB  | VBG | pcomp    | xxxx  | true     | false   |
+# | U.K.    | U.K.    | PROPN | NNP | dobj     | X.X.  | false    | false   |
+# | startup | startup | NOUN  | NN  | advcl    | xxxx  | true     | false   |
+# | for     | for     | ADP   | IN  | prep     | xxx   | true     | true    |
+# | $       | $       | SYM   | $   | quantmod | $     | false    | false   |
+# | 1       | 1       | NUM   | CD  | compound | d     | false    | false   |
+# | billion | billion | NUM   | CD  | pobj     | xxxx  | true     | false   |
+# +---------+---------+-------+-----+----------+-------+----------+---------+
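In the table above, the underscore-suffixed attributes (lemma_, pos_, tag_, dep_, shape_) follow spaCy's convention: the bare attribute is an integer hash id, and the underscore variant is its human-readable string. A minimal sketch, reusing the nlp object from the example (the exact id value depends on spaCy's symbol table):

    token = nlp.read("Apple is looking")[0]
    puts token.pos                            # an integer id for the part of speech
    puts token.pos_                           # => "PROPN"
    puts nlp.vocab_string_lookup(token.pos)   # resolves the id back to "PROPN"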
data/examples/linguistic_features/retokenize_1.rb
ADDED
@@ -0,0 +1,29 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_sm")
+
+sentence = "Credit and mortgage account holders must submit their requests"
+doc = nlp.read(sentence)
+
+headings = ["text", "pos", "dep", "head text"]
+rows = []
+
+doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)
+
+doc.each do |token|
+  rows << [token.text, token.pos_, token.dep_, token.head.text]
+end
+
+table = Terminal::Table.new rows: rows, headings: headings
+puts table
+
+# +-------------------------------------+------+-------+-----------+
+# | text                                | pos  | dep   | head text |
+# +-------------------------------------+------+-------+-----------+
+# | Credit and mortgage account holders | NOUN | nsubj | submit    |
+# | must                                | AUX  | aux   | submit    |
+# | submit                              | VERB | ROOT  | submit    |
+# | their                               | PRON | poss  | requests  |
+# | requests                            | NOUN | dobj  | submit    |
+# +-------------------------------------+------+-------+-----------+
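In this example, doc[4] is the noun "holders", and its left_edge and right_edge attributes give the first and last tokens of its dependency subtree, which is why the whole noun phrase merges into one token. A minimal sketch of inspecting those boundaries before merging (output comments reflect the parse above):

    nlp = Spacy::Language.new("en_core_web_sm")
    doc = nlp.read("Credit and mortgage account holders must submit their requests")

    head = doc[4]                # "holders"
    puts head.left_edge.text     # => "Credit"  (start of the subtree)
    puts head.right_edge.text    # => "holders" (end of the subtree)
    # doc.retokenize(head.left_edge.i, head.right_edge.i) merges that whole span.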
data/examples/linguistic_features/retokenize_2.rb
ADDED
@@ -0,0 +1,16 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_sm")
+
+sentence = "I live in New York"
+doc = nlp.read(sentence)
+
+puts "Before: " + doc.tokens.join(", ")
+
+doc.retokenize(3, 4)
+
+puts "After: " + doc.tokens.join(", ")
+
+# Before: I, live, in, New, York
+# After: I, live, in, New York
data/examples/linguistic_features/rule_based_morphology.rb
ADDED
@@ -0,0 +1,12 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_sm")
+
+doc = nlp.read("Where are you?")
+
+puts "Morph features of the third word: " + doc[2].morph.to_s
+puts "POS of the third word: " + doc[2].pos_.to_s
+
+# Morph features of the third word: Case=Nom|Person=2|PronType=Prs
+# POS of the third word: PRON
data/examples/linguistic_features/sentence_segmentation.rb
ADDED
@@ -0,0 +1,16 @@
+require "ruby-spacy"
+
+nlp = Spacy::Language.new("en_core_web_sm")
+
+doc = nlp.read("This is a sentence. This is another sentence.")
+
+
+puts "doc has annotation SENT_START: " + doc.has_annotation("SENT_START").to_s
+
+doc.sents.each do |sent|
+  puts sent.text
+end
+
+# doc has annotation SENT_START: true
+# This is a sentence.
+# This is another sentence.
data/examples/linguistic_features/similarity.rb
ADDED
@@ -0,0 +1,14 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_lg")
+doc1 = nlp.read("I like salty fries and hamburgers.")
+doc2 = nlp.read("Fast food tastes very good.")
+
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
+puts "Similarity: #{doc1.similarity(doc2)}"
+
+# Doc 1: I like salty fries and hamburgers.
+# Doc 2: Fast food tastes very good.
+# Similarity: 0.7687607012190486
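The similarity scores here are computed over word vectors, which is why this example loads en_core_web_lg; the small en_core_web_sm model ships without static vectors and produces far less meaningful scores. A quick sanity check, using the gem's get_lexeme helper defined in lib/ruby-spacy.rb below:

    nlp = Spacy::Language.new("en_core_web_lg")
    lexeme = nlp.get_lexeme("fries")
    puts lexeme.has_vector   # => true when the model ships static vectors
    puts lexeme.vector_norm  # non-zero for a real vector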
data/examples/linguistic_features/similarity_between_spans.rb
ADDED
@@ -0,0 +1,23 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_lg")
+doc1 = nlp.read("I like salty fries and hamburgers.")
+doc2 = nlp.read("Fast food tastes very good.")
+
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
+puts "Similarity: #{doc1.similarity(doc2)}"
+
+span1 = doc1.span(2, 2)    # salty fries
+span2 = doc1.span(5 .. 5)  # hamburgers
+puts "Span 1: " + span1.text
+puts "Span 2: " + span2.text
+puts "Similarity: #{span1.similarity(span2)}"
+
+# Doc 1: I like salty fries and hamburgers.
+# Doc 2: Fast food tastes very good.
+# Similarity: 0.7687607012190486
+# Span 1: salty fries
+# Span 2: hamburgers
+# Similarity: 0.6949787735939026
data/examples/linguistic_features/special_case_tokenization_rules.rb
ADDED
@@ -0,0 +1,19 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_sm")
+
+doc = nlp.read("gimme that")
+
+puts doc.tokens.join(" ")
+
+# Add special case rule
+special_case = [{ORTH: "gim"}, {ORTH: "me"}]
+tokenizer = nlp.tokenizer
+tokenizer.add_special_case("gimme", special_case)
+
+# Check new tokenization
+puts nlp.read("gimme that").tokens.join(" ")
+
+# gimme that
+# gim me that
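One constraint worth keeping in mind (enforced by spaCy itself, not by this gem): the ORTH values of a special case must concatenate back to exactly the string being split, because spaCy tokenization never alters the underlying text.

    # "gim" + "me" == "gimme", so the rule above is accepted.
    # A rule like the following would be rejected, since "give" + "me" != "gimme":
    # tokenizer.add_special_case("gimme", [{ORTH: "give"}, {ORTH: "me"}])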
data/examples/linguistic_features/tokenization.rb
ADDED
@@ -0,0 +1,23 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_sm")
+
+doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
+
+headings = [1,2,3,4,5,6,7,8,9,10,11]
+row = []
+
+doc.each do |token|
+  row << token.text
+end
+
+table = Terminal::Table.new rows: [row], headings: headings
+puts table
+
+# +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
+# | 1     | 2  | 3       | 4  | 5      | 6    | 7       | 8   | 9 | 10 | 11      |
+# +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
+# | Apple | is | looking | at | buying | U.K. | startup | for | $ | 1  | billion |
+# +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
+
data/examples/rule_based_matching/creating_spans_from_matches.rb
ADDED
@@ -0,0 +1,16 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_lg")
+matcher = nlp.matcher
+matcher.add("US_PRESIDENT", [[{LOWER: "barack"}, {LOWER: "obama"}]])
+doc = nlp.read("Barack Obama was the 44th president of the United States")
+
+matches = matcher.match(doc)
+
+matches.each do |match|
+  span = Spacy::Span.new(doc, start_index: match[:start_index], end_index: match[:end_index], options: {label: match[:match_id]})
+  puts span.text + " / " + span.label_
+end
+
+# Barack Obama / US_PRESIDENT
data/examples/rule_based_matching/matcher.rb
ADDED
@@ -0,0 +1,19 @@
+require "ruby-spacy"
+
+nlp = Spacy::Language.new("en_core_web_sm")
+
+pattern = [[{LOWER: "hello"}, {IS_PUNCT: true}, {LOWER: "world"}]]
+
+matcher = nlp.matcher
+matcher.add("HelloWorld", pattern)
+
+doc = nlp.read("Hello, world! Hello world!")
+matches = matcher.match(doc)
+
+matches.each do |match|
+  string_id = nlp.vocab_string_lookup(match[:match_id])
+  span = doc.span(match[:start_index]..match[:end_index])
+  puts "#{string_id}, #{span.text}"
+end
+
+# HelloWorld, Hello, world
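As the library source below shows, Matcher#match returns plain Ruby hashes rather than spaCy's (match_id, start, end) tuples, and it converts the end position to an inclusive index, which is why the result can feed a Ruby range here (or Span.new in the previous example) directly. The shape, illustratively:

    matches = matcher.match(doc)
    # => [{match_id: <Integer>, start_index: 0, end_index: 2}]
    # match_id    : hash of the pattern label; resolve it with nlp.vocab_string_lookup
    # start_index : index of the first token of the match
    # end_index   : index of the last token (inclusive, i.e. Python's end minus one)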
data/lib/ruby-spacy.rb
ADDED
@@ -0,0 +1,567 @@
+# frozen_string_literal: true
+
+require_relative "ruby-spacy/version"
+require 'enumerator'
+require 'strscan'
+require 'pycall/import'
+require 'numpy'
+include PyCall::Import
+
+# This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
+module Spacy
+  # A utility module method to convert a Python generator object to a Ruby array,
+  # mainly used on the items inside the array returned from dependency-related methods
+  # such as {Span#rights}, {Span#lefts} and {Span#subtree}.
+  def self.generator_to_array(py_generator)
+    PyCall::List.(py_generator)
+  end
+
+  # See also the spaCy Python API document for [`Span`](https://spacy.io/api/span).
+  class Span
+
+    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
+    attr_reader :spacy_span_id
+
+    # @return [Object] a Python `Span` instance accessible via `PyCall`
+    attr_reader :py_span
+
+    # @return [Doc] the document to which the span belongs
+    attr_reader :doc
+
+    include Enumerable
+
+    alias_method :length, :count
+    alias_method :len, :count
+    alias_method :size, :count
+
+    # It is recommended to use the {Doc#span} method to create a span. If you need to
+    # create one with {Span#initialize}, use one of these two signatures: `Span.new(doc, py_span: py_span)` or `Span.new(doc, start_index: i, end_index: j, options: opts)`.
+    # @param doc [Doc] the document to which this span belongs
+    # @param start_index [Integer] the index of the item starting the span inside a doc
+    # @param end_index [Integer] the index of the item ending the span inside a doc
+    # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
+    def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
+      @doc = doc
+      @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
+      if py_span
+        @py_span = py_span
+      else
+        options = PyCall::Dict.(options)
+        PyCall.exec("#{@spacy_span_id}_opts = #{options}")
+        PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
+        @py_span = PyCall.eval(@spacy_span_id)
+      end
+    end
+
+    # Returns an array of tokens contained in the span.
+    # @return [Array<Token>]
+    def tokens
+      results = []
+      PyCall::List.(@py_span).each do |py_token|
+        results << Token.new(py_token)
+      end
+      results
+    end
+
+    # Iterates over the elements in the span, yielding a token instance.
+    def each
+      PyCall::List.(@py_span).each do |py_token|
+        yield Token.new(py_token)
+      end
+    end
+
+    # Returns an array of spans of noun chunks.
+    # @return [Array<Span>]
+    def noun_chunks
+      chunk_array = []
+      py_chunks = PyCall::List.(@py_span.noun_chunks)
+      py_chunks.each do |py_span|
+        chunk_array << Spacy::Span.new(@doc, py_span: py_span)
+      end
+      chunk_array
+    end
+
+    # Returns an array of spans that represent sentences.
+    # @return [Array<Span>]
+    def sents
+      sentence_array = []
+      py_sentences = PyCall::List.(@py_span.sents)
+      py_sentences.each do |py_span|
+        sentence_array << Spacy::Span.new(@doc, py_span: py_span)
+      end
+      sentence_array
+    end
+
+    # Returns an array of spans that represent named entities.
+    # @return [Array<Span>]
+    def ents
+      ent_array = []
+      PyCall::List.(@py_span.ents).each do |py_span|
+        ent_array << Spacy::Span.new(@doc, py_span: py_span)
+      end
+      ent_array
+    end
+
+    # Returns a span that represents the sentence that the given span is part of.
+    # @return [Span]
+    def sent
+      py_span = @py_span.sent
+      Spacy::Span.new(@doc, py_span: py_span)
+    end
+
+    # Returns a span if a range object is given, or a token if an integer representing a position within the span is given.
+    # @param range [Range] an ordinary Ruby range object such as `0..3`, `1...4`, or `3 .. -1`
+    def [](range)
+      if range.is_a?(Range)
+        py_span = @py_span[range]
+        Spacy::Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
+      else
+        Spacy::Token.new(@py_span[range])
+      end
+    end
+
+    # Returns a semantic similarity estimate.
+    # @param other [Span] the other span to which a similarity estimation is conducted
+    # @return [Float]
+    def similarity(other)
+      PyCall.eval("#{@spacy_span_id}.similarity(#{other.spacy_span_id})")
+    end
+
+    # Creates a document instance from the span.
+    # @return [Doc]
+    def as_doc
+      Spacy::Doc.new(@doc.spacy_nlp_id, self.text)
+    end
+
+    # Returns tokens coordinated to the root of the span.
+    # @return [Array<Token>] an array of tokens
+    def conjuncts
+      conjunct_array = []
+      PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
+        conjunct_array << Spacy::Token.new(py_conjunct)
+      end
+      conjunct_array
+    end
+
+    # Returns tokens that are to the left of the span, whose heads are within the span.
+    # @return [Array<Token>] an array of tokens
+    def lefts
+      left_array = []
+      PyCall::List.(@py_span.lefts).each do |py_left|
+        left_array << Spacy::Token.new(py_left)
+      end
+      left_array
+    end
+
+    # Returns tokens that are to the right of the span, whose heads are within the span.
+    # @return [Array<Token>] an array of tokens
+    def rights
+      right_array = []
+      PyCall::List.(@py_span.rights).each do |py_right|
+        right_array << Spacy::Token.new(py_right)
+      end
+      right_array
+    end
+
+    # Returns tokens that are within the span and tokens that descend from them.
+    # @return [Array<Token>] an array of tokens
+    def subtree
+      subtree_array = []
+      PyCall::List.(@py_span.subtree).each do |py_subtree|
+        subtree_array << Spacy::Token.new(py_subtree)
+      end
+      subtree_array
+    end
+
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_span.send(name, *args)
+    end
+  end
+
+  # See also the spaCy Python API document for [`Token`](https://spacy.io/api/token).
+  class Token
+
+    # @return [Object] a Python `Token` instance accessible via `PyCall`
+    attr_reader :py_token
+
+    # @return [String] a string representing the token
+    attr_reader :text
+
+    # It is recommended to use the {Doc#tokens} or {Span#tokens} methods to create tokens. There is no way to generate a token from scratch without relying on a pre-existing Python `Token` object.
+    # @param py_token [Object] Python `Token` object
+    def initialize(py_token)
+      @py_token = py_token
+      @text = @py_token.text
+    end
+
+    # Returns the token in question and the tokens that descend from it.
+    # @return [Array<Object>] a (Ruby) array of Python `Token` objects
+    def subtree
+      descendant_array = []
+      PyCall::List.(@py_token.subtree).each do |descendant|
+        descendant_array << descendant
+      end
+      descendant_array
+    end
+
+    # Returns the token's ancestors.
+    # @return [Array<Object>] a (Ruby) array of Python `Token` objects
+    def ancestors
+      ancestor_array = []
+      PyCall::List.(@py_token.ancestors).each do |ancestor|
+        ancestor_array << ancestor
+      end
+      ancestor_array
+    end
+
+    # Returns a sequence of the token's immediate syntactic children.
+    # @return [Array<Object>] a (Ruby) array of Python `Token` objects
+    def children
+      child_array = []
+      PyCall::List.(@py_token.children).each do |child|
+        child_array << child
+      end
+      child_array
+    end
+
+    # The leftward immediate children of the word in the syntactic dependency parse.
+    # @return [Array<Object>] a (Ruby) array of Python `Token` objects
+    def lefts
+      token_array = []
+      PyCall::List.(@py_token.lefts).each do |token|
+        token_array << token
+      end
+      token_array
+    end
+
+    # The rightward immediate children of the word in the syntactic dependency parse.
+    # @return [Array<Object>] a (Ruby) array of Python `Token` objects
+    def rights
+      token_array = []
+      PyCall::List.(@py_token.rights).each do |token|
+        token_array << token
+      end
+      token_array
+    end
+
+    # String representation of the token.
+    # @return [String]
+    def to_s
+      @text
+    end
+
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_token.send(name, *args)
+    end
+  end
+
+  # See also the spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
+  class Doc
+
+    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
+    attr_reader :spacy_nlp_id
+
+    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
+    attr_reader :spacy_doc_id
+
+    # @return [Object] a Python `Doc` instance accessible via `PyCall`
+    attr_reader :py_doc
+
+    # @return [String] a text string of the document
+    attr_reader :text
+
+    include Enumerable
+
+    alias_method :length, :count
+    alias_method :len, :count
+    alias_method :size, :count
+
+    # Creates a new instance of {Doc}.
+    # @param nlp_id [String] The id string of the `nlp`, an instance of the {Language} class
+    # @param text [String] The text string to be analyzed
+    def initialize(nlp_id, text)
+      @text = text
+      @spacy_nlp_id = nlp_id
+      @spacy_doc_id = "doc_#{text.object_id}"
+      quoted = text.gsub('"', '\"')
+      PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
+      PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
+      @py_doc = PyCall.eval(@spacy_doc_id)
+    end
+
+    # Retokenizes the text, merging a span into a single token.
+    # @param start_index [Integer] The start position of the span to be retokenized in the document
+    # @param end_index [Integer] The end position (inclusive) of the span to be retokenized in the document
+    # @param attributes [Hash] Attributes to set on the merged token
+    def retokenize(start_index, end_index, attributes = {})
+      py_attrs = PyCall::Dict.(attributes)
+      PyCall.exec(<<PY)
+with #{@spacy_doc_id}.retokenize() as retokenizer:
+    retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
+PY
+      @py_doc = PyCall.eval(@spacy_doc_id)
+    end
+
+    # Retokenizes the text, splitting the specified token.
+    # @param pos_in_doc [Integer] The position of the token to be retokenized in the document
+    # @param split_array [Array<String>] text strings of the split results
+    # @param head_pos_in_split [Integer] The position of the head token within the split results
+    # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
+    # @param attributes [Hash] The attributes of the split elements
+    def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
+      py_attrs = PyCall::Dict.(attributes)
+      py_split_array = PyCall::List.(split_array)
+      PyCall.exec(<<PY)
+with #{@spacy_doc_id}.retokenize() as retokenizer:
+    heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
+    attrs = #{py_attrs}
+    split_array = #{py_split_array}
+    retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
+PY
+      @py_doc = PyCall.eval(@spacy_doc_id)
+    end
+
+    # String representation of the document.
+    # @return [String]
+    def to_s
+      @text
+    end
+
+    # Returns an array of tokens contained in the doc.
+    # @return [Array<Token>]
+    def tokens
+      results = []
+      PyCall::List.(@py_doc).each do |py_token|
+        results << Token.new(py_token)
+      end
+      results
+    end
+
+    # Iterates over the elements in the doc, yielding a token instance.
+    def each
+      PyCall::List.(@py_doc).each do |py_token|
+        yield Token.new(py_token)
+      end
+    end
+
+    # Returns a span of the specified range within the doc.
+    # The method can be used in either of two ways: `Doc#span(range)` or `Doc#span(start_index, size_of_span)`.
+    # @param range_or_start [Range, Integer] A range object, or, alternatively, an integer that represents the start position of the span
+    # @param optional_size [Integer] An integer representing the size of the span
+    # @return [Span]
+    def span(range_or_start, optional_size = nil)
+      if optional_size
+        start_index = range_or_start
+        temp = tokens[start_index ... start_index + optional_size]
+      else
+        start_index = range_or_start.first
+        range = range_or_start
+        temp = tokens[range]
+      end
+
+      end_index = start_index + temp.size - 1
+
+      Span.new(self, start_index: start_index, end_index: end_index)
+    end
+
+    # Returns an array of spans representing noun chunks.
+    # @return [Array<Span>]
+    def noun_chunks
+      chunk_array = []
+      py_chunks = PyCall::List.(@py_doc.noun_chunks)
+      py_chunks.each do |py_chunk|
+        chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
+      end
+      chunk_array
+    end
+
+    # Returns an array of spans representing sentences.
+    # @return [Array<Span>]
+    def sents
+      sentence_array = []
+      py_sentences = PyCall::List.(@py_doc.sents)
+      py_sentences.each do |py_sent|
+        sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
+      end
+      sentence_array
+    end
+
+    # Returns an array of spans representing named entities.
+    # @return [Array<Span>]
+    def ents
+      # so that ents can be "each"-ed in Ruby
+      ent_array = []
+      PyCall::List.(@py_doc.ents).each do |ent|
+        ent_array << ent
+      end
+      ent_array
+    end
+
+    # Returns a span if given a range object; returns a token if given an integer representing a position in the doc.
+    # @param range [Range] an ordinary Ruby range object such as `0..3`, `1...4`, or `3 .. -1`
+    def [](range)
+      if range.is_a?(Range)
+        py_span = @py_doc[range]
+        Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
+      else
+        Token.new(@py_doc[range])
+      end
+    end
+
+    # Returns a semantic similarity estimate.
+    # @param other [Doc] the other doc to which a similarity estimation is made
+    # @return [Float]
+    def similarity(other)
+      PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
+    end
+
+    # Visualizes the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
+    # @param style [String] Either `dep` or `ent`
+    # @param compact [Boolean] Only relevant to the `dep` style
+    # @return [String] in the case of the `dep` style, the output text is SVG, while in the `ent` style it is HTML
+    def displacy(style: "dep", compact: false)
+      PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
+    end
+
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_doc.send(name, *args)
+    end
+  end
+
+  # See also the spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
+  class Matcher
+
+    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
+    attr_reader :spacy_matcher_id
+
+    # @return [Object] a Python `Matcher` instance accessible via `PyCall`
+    attr_reader :py_matcher
+
+    # Creates a {Matcher} instance.
+    # @param nlp_id [String] The id string of the `nlp`, an instance of the {Language} class
+    def initialize(nlp_id)
+      @spacy_matcher_id = "doc_#{nlp_id}_matcher"
+      PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
+      @py_matcher = PyCall.eval(@spacy_matcher_id)
+    end
+
+    # Adds a label string and a text pattern.
+    # @param text [String] a label string given to the pattern
+    # @param pattern [Array<Array<Hash>>] alternative sequences of text patterns
+    def add(text, pattern)
+      @py_matcher.add(text, pattern)
+    end
+
+    # Executes the match.
+    # @param doc [Doc] a {Doc} instance
+    # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] The id of the matched pattern, the starting position, and the end position (inclusive)
+    def match(doc)
+      str_results = PyCall.eval("#{@spacy_matcher_id}(#{doc.spacy_doc_id})").to_s
+      s = StringScanner.new(str_results[1..-2])
+      results = []
+      while s.scan_until(/(\d+), (\d+), (\d+)/)
+        next unless s.matched
+        triple = s.matched.split(", ")
+        match_id = triple[0].to_i
+        start_index = triple[1].to_i
+        end_index = triple[2].to_i - 1
+        results << {match_id: match_id, start_index: start_index, end_index: end_index}
+      end
+      results
+    end
+  end
+
+  # See also the spaCy Python API document for [`Language`](https://spacy.io/api/language).
+  class Language
+
+    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
+    attr_reader :spacy_nlp_id
+
+    # @return [Object] a Python `Language` instance accessible via `PyCall`
+    attr_reader :py_nlp
+
+    # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
+    # @param model [String] A language model installed in the system
+    def initialize(model = "en_core_web_sm")
+      @spacy_nlp_id = "nlp_#{model.object_id}"
+      PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
+      PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
+      @py_nlp = PyCall.eval(@spacy_nlp_id)
+    end
+
+    # Reads and analyzes the given text.
+    # @param text [String] A text string to be read and analyzed
+    # @return [Doc]
+    def read(text)
+      Doc.new(@spacy_nlp_id, text)
+    end
+
+    # Generates a matcher for the current language model.
+    # @return [Matcher]
+    def matcher
+      Matcher.new(@spacy_nlp_id)
+    end
+
+    # A utility method to look up a vocabulary item of the given id.
+    # @param id [Integer] A vocabulary id
+    # @return [Object] A Python `Lexeme` object
+    def vocab_string_lookup(id)
+      PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+    end
+
+    # A utility method to list pipeline components.
+    # @return [Array<String>] An array of text strings representing pipeline components
+    def pipe_names
+      pipe_array = []
+      PyCall::List.(@py_nlp.pipe_names).each do |pipe|
+        pipe_array << pipe
+      end
+      pipe_array
+    end
+
+    # A utility method to get the tokenizer Python object.
+    # @return [Object] Python `Tokenizer` object
+    def tokenizer
+      PyCall.eval("#{@spacy_nlp_id}.tokenizer")
+    end
+
+    # A utility method to get a Python `Lexeme` object.
+    # @param text [String] A text string representing a lexeme
+    # @return [Object] Python `Lexeme` object
+    def get_lexeme(text)
+      text = text.gsub("'") { "\\'" } # escape single quotes for the Python string literal below
+      PyCall.eval("#{@spacy_nlp_id}.vocab['#{text}']")
+    end
+
+    # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
+    # @param vector [Object] A vector representation of a word (whether existing or non-existing)
+    # @param n [Integer] The number of lexemes to return
+    # @return [Array<Hash{:key => Integer, :text => String, :best_row => Integer, :score => Float}>] An array of hash objects, each containing the `key`, `text`, `best_row` and similarity `score` of a lexeme
+    def most_similar(vector, n)
+      vec_array = Numpy.asarray([vector])
+      py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
+      key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
+      keys = key_texts.map { |kt| kt[0] }
+      texts = key_texts.map { |kt| kt[1] }
+      best_rows = PyCall::List.(py_result[1])[0]
+      scores = PyCall::List.(py_result[2])[0]
+
+      results = []
+      n.times do |i|
+        results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
+      end
+
+      results
+    end
+
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_nlp.send(name, *args)
+    end
+  end
+
+end
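Taken together, the wrapper stays close to spaCy's Python surface: Language, Doc, Span and Token mirror their Python counterparts, and anything not explicitly wrapped falls through to Python via method_missing. A minimal end-to-end sketch under those assumptions (it requires spaCy and the en_core_web_sm model to be installed and reachable through PyCall; the sentence is illustrative):

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_sm")   # wraps spacy.load("en_core_web_sm")
    doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")

    # Token attributes not wrapped in Ruby (pos_, dep_, ...) reach Python via method_missing.
    doc.each { |token| puts "#{token.text}/#{token.pos_}" }

    # Doc#span accepts a (start, size) pair or a range.
    puts doc.span(0, 2).text   # => "Apple is"

    # Doc#ents yields raw Python Span objects; their attributes also work through PyCall.
    doc.ents.each { |ent| puts "#{ent.text} (#{ent.label_})" }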