ruby-spacy 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +58 -0
- data/.yardopts +2 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +39 -0
- data/LICENSE.txt +21 -0
- data/README.md +498 -0
- data/Rakefile +12 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/examples/get_started/lexeme.rb +24 -0
- data/examples/get_started/linguistic_annotations.rb +32 -0
- data/examples/get_started/most_similar.rb +46 -0
- data/examples/get_started/named_entities.rb +24 -0
- data/examples/get_started/outputs/test_dep.svg +84 -0
- data/examples/get_started/outputs/test_dep_compact.svg +84 -0
- data/examples/get_started/outputs/test_ent.html +11 -0
- data/examples/get_started/pos_tags_and_dependencies.rb +31 -0
- data/examples/get_started/similarity.rb +13 -0
- data/examples/get_started/tokenization.rb +22 -0
- data/examples/get_started/visualizing_dependencies.rb +14 -0
- data/examples/get_started/visualizing_dependencies_compact.rb +12 -0
- data/examples/get_started/visualizing_named_entities.rb +12 -0
- data/examples/get_started/vocab.rb +10 -0
- data/examples/get_started/word_vectors.rb +24 -0
- data/examples/japanese/ancestors.rb +44 -0
- data/examples/japanese/entity_annotations_and_labels.rb +45 -0
- data/examples/japanese/information_extraction.rb +27 -0
- data/examples/japanese/lemmatization.rb +32 -0
- data/examples/japanese/most_similar.rb +46 -0
- data/examples/japanese/named_entity_recognition.rb +27 -0
- data/examples/japanese/navigating_parse_tree.rb +34 -0
- data/examples/japanese/noun_chunks.rb +23 -0
- data/examples/japanese/outputs/test_dep.svg +149 -0
- data/examples/japanese/outputs/test_ent.html +16 -0
- data/examples/japanese/pos_tagging.rb +34 -0
- data/examples/japanese/sentence_segmentation.rb +16 -0
- data/examples/japanese/similarity.rb +12 -0
- data/examples/japanese/tokenization.rb +38 -0
- data/examples/japanese/visualizing_dependencies.rb +13 -0
- data/examples/japanese/visualizing_named_entities.rb +14 -0
- data/examples/linguistic_features/ancestors.rb +41 -0
- data/examples/linguistic_features/entity_annotations_and_labels.rb +29 -0
- data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +20 -0
- data/examples/linguistic_features/information_extraction.rb +36 -0
- data/examples/linguistic_features/iterating_children.rb +24 -0
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +20 -0
- data/examples/linguistic_features/lemmatization.rb +31 -0
- data/examples/linguistic_features/morphology.rb +17 -0
- data/examples/linguistic_features/named_entity_recognition.rb +25 -0
- data/examples/linguistic_features/navigating_parse_tree.rb +32 -0
- data/examples/linguistic_features/noun_chunks.rb +27 -0
- data/examples/linguistic_features/outputs/test_ent.html +11 -0
- data/examples/linguistic_features/pos_tagging.rb +31 -0
- data/examples/linguistic_features/retokenize_1.rb +29 -0
- data/examples/linguistic_features/retokenize_2.rb +16 -0
- data/examples/linguistic_features/rule_based_morphology.rb +12 -0
- data/examples/linguistic_features/sentence_segmentation.rb +16 -0
- data/examples/linguistic_features/similarity.rb +14 -0
- data/examples/linguistic_features/similarity_between_spans.rb +23 -0
- data/examples/linguistic_features/special_case_tokenization_rules.rb +19 -0
- data/examples/linguistic_features/tokenization.rb +23 -0
- data/examples/rule_based_matching/creating_spans_from_matches.rb +16 -0
- data/examples/rule_based_matching/matcher.rb +19 -0
- data/lib/ruby-spacy.rb +567 -0
- data/lib/ruby-spacy/version.rb +6 -0
- data/ruby-spacy.gemspec +42 -0
- metadata +157 -0
@@ -0,0 +1,27 @@
|
|
1
|
+
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

lemmatizer = nlp.get_pipe("lemmatizer")
puts "Lemmatizer mode: " + lemmatizer.mode

doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")

headings = %w[text root.text root.dep root.head.text]

# One table row per noun chunk: the chunk itself plus its syntactic root info.
rows = doc.noun_chunks.map do |chunk|
  [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
end

puts Terminal::Table.new(rows: rows, headings: headings)

# +---------------------+---------------+----------+----------------+
# | text                | root.text     | root.dep | root.head.text |
# +---------------------+---------------+----------+----------------+
# | Autonomous cars     | cars          | nsubj    | shift          |
# | insurance liability | liability     | dobj     | shift          |
# | manufacturers       | manufacturers | pobj     | toward         |
# +---------------------+---------------+----------+----------------+
|
@@ -0,0 +1,11 @@
|
|
1
|
+
<div class="entities" style="line-height: 2.5; direction: ltr">When
|
2
|
+
<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
3
|
+
Sebastian Thrun
|
4
|
+
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
|
5
|
+
</mark>
|
6
|
+
started working on self-driving cars at Google in
|
7
|
+
<mark class="entity" style="background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
8
|
+
2007
|
9
|
+
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">DATE</span>
|
10
|
+
</mark>
|
11
|
+
, few people outside of the company took him seriously.</div>
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")

headings = %w[text lemma pos tag dep shape is_alpha is_stop]

# Build one row of linguistic annotations per token.
rows = doc.map do |token|
  [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
end

puts Terminal::Table.new(rows: rows, headings: headings)

# +---------+---------+-------+-----+----------+-------+----------+---------+
# | text    | lemma   | pos   | tag | dep      | shape | is_alpha | is_stop |
# +---------+---------+-------+-----+----------+-------+----------+---------+
# | Apple   | Apple   | PROPN | NNP | nsubj    | Xxxxx | true     | false   |
# | is      | be      | AUX   | VBZ | aux      | xx    | true     | true    |
# | looking | look    | VERB  | VBG | ROOT     | xxxx  | true     | false   |
# | at      | at      | ADP   | IN  | prep     | xx    | true     | true    |
# | buying  | buy     | VERB  | VBG | pcomp    | xxxx  | true     | false   |
# | U.K.    | U.K.    | PROPN | NNP | dobj     | X.X.  | false    | false   |
# | startup | startup | NOUN  | NN  | advcl    | xxxx  | true     | false   |
# | for     | for     | ADP   | IN  | prep     | xxx   | true     | true    |
# | $       | $       | SYM   | $   | quantmod | $     | false    | false   |
# | 1       | 1       | NUM   | CD  | compound | d     | false    | false   |
# | billion | billion | NUM   | CD  | pobj     | xxxx  | true     | false   |
# +---------+---------+-------+-----+----------+-------+----------+---------+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

doc = nlp.read("Credit and mortgage account holders must submit their requests")

headings = ["text", "pos", "dep", "head text"]

# Merge the whole subtree around token 4 ("holders") into one token,
# spanning from its leftmost to its rightmost descendant.
doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)

rows = doc.map { |token| [token.text, token.pos_, token.dep_, token.head.text] }

puts Terminal::Table.new(rows: rows, headings: headings)

# +-------------------------------------+------+-------+-----------+
# | text                                | pos  | dep   | head text |
# +-------------------------------------+------+-------+-----------+
# | Credit and mortgage account holders | NOUN | nsubj | submit    |
# | must                                | AUX  | aux   | submit    |
# | submit                              | VERB | ROOT  | submit    |
# | their                               | PRON | poss  | requests  |
# | requests                            | NOUN | dobj  | submit    |
# +-------------------------------------+------+-------+-----------+
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

sentence = "I live in New York"
doc = nlp.read(sentence)

# `collect {|t| t}` was a redundant identity map: `join` already calls
# `to_s` on each Token, so joining the token array directly is equivalent.
puts "Before: " + doc.tokens.join(", ")

# Merge tokens 3..4 ("New", "York") into a single token.
doc.retokenize(3, 4)

puts "After: " + doc.tokens.join(", ")

# Before: I, live, in, New, York
# After: I, live, in, New York
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

doc = nlp.read("Where are you?")

# Interpolation calls #to_s, so these print exactly what the original did.
puts "Morph features of the third word: #{doc[2].morph}"
puts "POS of the third word: #{doc[2].pos_}"

# Morph features of the third word: Case=Nom|Person=2|PronType=Prs
# POS of the third word: PRON
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")

doc = nlp.read("This is a sentence. This is another sentence.")

# Sentence boundaries are only available once SENT_START has been set.
puts "doc has annotation SENT_START: #{doc.has_annotation("SENT_START")}"

doc.sents.each { |sent| puts sent.text }

# doc has annotation SENT_START: true
# This is a sentence.
# This is another sentence.
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_lg")
doc1 = nlp.read("I like salty fries and hamburgers.")
doc2 = nlp.read("Fast food tastes very good.")

# BUG FIX: `"Doc 1: " + doc1` raises TypeError — String#+ requires a String
# (implicit conversion via #to_str), and Doc only defines #to_s.
# Interpolation calls #to_s and prints the intended text.
puts "Doc 1: #{doc1}"
puts "Doc 2: #{doc2}"
puts "Similarity: #{doc1.similarity(doc2)}"

# Doc 1: I like salty fries and hamburgers.
# Doc 2: Fast food tastes very good.
# Similarity: 0.7687607012190486
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_lg")
doc1 = nlp.read("I like salty fries and hamburgers.")
doc2 = nlp.read("Fast food tastes very good.")

# BUG FIX: `"Doc 1: " + doc1` raises TypeError — String#+ requires a String
# (implicit conversion via #to_str), and Doc only defines #to_s.
# Interpolation calls #to_s and prints the intended text.
puts "Doc 1: #{doc1}"
puts "Doc 2: #{doc2}"
puts "Similarity: #{doc1.similarity(doc2)}"

span1 = doc1.span(2, 2)  # salty fries
span2 = doc1.span(5..5)  # hamburgers
puts "Span 1: " + span1.text
puts "Span 2: " + span2.text
puts "Similarity: #{span1.similarity(span2)}"

# Doc 1: I like salty fries and hamburgers.
# Doc 2: Fast food tastes very good.
# Similarity: 0.7687607012190486
# Span 1: salty fries
# Span 2: hamburgers
# Similarity: 0.6949787735939026
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

doc = nlp.read("gimme that")

puts doc.tokens.join(" ")

# Add special case rule: split "gimme" into "gim" + "me".
special_case = [{ ORTH: "gim" }, { ORTH: "me" }]
nlp.tokenizer.add_special_case("gimme", special_case)

# Check new tokenization
puts nlp.read("gimme that").tokens.join(" ")

# gimme that
# gim me that
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")

doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")

# Number the columns 1..11 and lay the token texts out as a single row.
headings = (1..11).to_a
row = doc.map(&:text)

puts Terminal::Table.new(rows: [row], headings: headings)

# +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
# | 1     | 2  | 3       | 4  | 5      | 6    | 7       | 8   | 9 | 10 | 11      |
# +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
# | Apple | is | looking | at | buying | U.K. | startup | for | $ | 1  | billion |
# +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
|
23
|
+
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_lg")
matcher = nlp.matcher
matcher.add("US_PRESIDENT", [[{ LOWER: "barack" }, { LOWER: "obama" }]])
doc = nlp.read("Barack Obama was the 44th president of the United States")

# Wrap each raw match in a labeled Span so the label is queryable.
matcher.match(doc).each do |match|
  span = Spacy::Span.new(doc,
                         start_index: match[:start_index],
                         end_index: match[:end_index],
                         options: { label: match[:match_id] })
  puts "#{span.text} / #{span.label_}"
end

# Barack Obama / US_PRESIDENT
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")

# Match "hello" + any punctuation + "world", case-insensitively.
pattern = [[{ LOWER: "hello" }, { IS_PUNCT: true }, { LOWER: "world" }]]

matcher = nlp.matcher
matcher.add("HelloWorld", pattern)

doc = nlp.read("Hello, world! Hello world!")

matcher.match(doc).each do |match|
  string_id = nlp.vocab_string_lookup(match[:match_id])
  span = doc.span(match[:start_index]..match[:end_index])
  puts "#{string_id}, #{span.text}"
end

# HelloWorld, Hello, world
|
data/lib/ruby-spacy.rb
ADDED
@@ -0,0 +1,567 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "ruby-spacy/version"
|
4
|
+
require 'enumerator'
|
5
|
+
require 'strscan'
|
6
|
+
require 'pycall/import'
|
7
|
+
require 'numpy'
|
8
|
+
include PyCall::Import
|
9
|
+
|
10
|
+
# This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
|
11
|
+
module Spacy
|
12
|
+
# A utility module method to convert Python's generator object to a Ruby array,
|
13
|
+
# mainly used on the items inside the array returned from dependency-related methods
|
14
|
+
# such as {Span#rights}, {Span#lefts} and {Span#subtree}.
|
15
|
+
def self.generator_to_array(py_generator)
|
16
|
+
PyCall::List.(py_generator)
|
17
|
+
end
|
18
|
+
|
19
|
+
  # See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
  #
  # A thin Ruby wrapper around a Python `Span`. Most state lives on the Python
  # side; an id string (+@spacy_span_id+) names the object in the Python
  # namespace so it can also be referenced from `PyCall.exec`/`PyCall.eval`
  # source strings (e.g. in {#similarity}).
  class Span

    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
    attr_reader :spacy_span_id

    # @return [Object] a Python `Span` instance accessible via `PyCall`
    attr_reader :py_span

    # @return [Doc] the document to which the span belongs
    attr_reader :doc

    include Enumerable

    # Enumerable#count drives all three size aliases (requires #each below).
    alias_method :length, :count
    alias_method :len, :count
    alias_method :size, :count

    # It is recommended to use {Doc#span} method to create a span. If you need to
    # create one using {Span#initialize}, either of the two method signatures should be
    # used: `Spacy.new(doc, py_span)` and `Spacy.new(doc, start_index, end_index, options)`.
    # When built from indices, a Python `Span(...)` is constructed by exec'ing
    # Python source; note the end index is made exclusive (`end_index + 1`).
    # @param doc [Doc] the document to which this span belongs to
    # @param start_index [Integer] the index of the item starting the span inside a doc
    # @param end_index [Integer] the index of the item ending the span inside a doc (inclusive)
    # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
    def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
      @doc = doc
      @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
      if py_span
        @py_span = py_span
      else
        options = PyCall::Dict.(options)
        PyCall.exec("#{@spacy_span_id}_opts = #{options}")
        PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
        @py_span = PyCall.eval(@spacy_span_id)
      end
    end

    # Returns an array of tokens contained in the span.
    # @return [Array<Token>]
    def tokens
      results = []
      PyCall::List.(@py_span).each do |py_token|
        results << Token.new(py_token)
      end
      results
    end

    # Iterates over the elements in the span yielding a token instance.
    # Required by Enumerable (and by the count-based aliases above).
    def each
      PyCall::List.(@py_span).each do |py_token|
        yield Token.new(py_token)
      end
    end

    # Returns an array of spans of noun chunks.
    # @return [Array<Span>]
    def noun_chunks
      chunk_array = []
      py_chunks = PyCall::List.(@py_span.noun_chunks)
      py_chunks.each do |py_span|
        chunk_array << Spacy::Span.new(@doc, py_span: py_span)
      end
      chunk_array
    end

    # Returns an array of spans that represents sentences.
    # @return [Array<Span>]
    def sents
      sentence_array = []
      py_sentences = PyCall::List.(@py_span.sents)
      py_sentences.each do |py_span|
        sentence_array << Spacy::Span.new(@doc, py_span: py_span)
      end
      sentence_array
    end

    # Returns an array of spans that represents named entities.
    # @return [Array<Span>]
    def ents
      ent_array = []
      PyCall::List.(@py_span.ents).each do |py_span|
        ent_array << Spacy::Span.new(@doc, py_span: py_span)
      end
      ent_array
    end

    # Returns a span that represents the sentence that the given span is part of.
    # @return [Span]
    def sent
      py_span = @py_span.sent
      return Spacy::Span.new(@doc, py_span: py_span)
    end

    # Returns a span if a range object is given, or a token if an integer representing the position of the doc is given.
    # Note the wrapped Python span's exclusive `end` is converted back to this
    # class's inclusive end_index (`end - 1`).
    # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
    def [](range)
      if range.is_a?(Range)
        py_span = @py_span[range]
        return Spacy::Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
      else
        return Spacy::Token.new(@py_span[range])
      end
    end

    # Returns a semantic similarity estimate.
    # Evaluated as Python source, so both spans must be registered in the
    # Python namespace under their id strings.
    # @param other [Span] the other span to which a similarity estimation is conducted
    # @return [Float]
    def similarity(other)
      PyCall.eval("#{@spacy_span_id}.similarity(#{other.spacy_span_id})")
    end

    # Creates a document instance from the span's text (re-runs the pipeline).
    # @return [Doc]
    def as_doc
      Spacy::Doc.new(@doc.spacy_nlp_id, self.text)
    end

    # Returns Tokens conjugated to the root of the span.
    # @return [Array<Token>] an array of tokens
    def conjuncts
      conjunct_array = []
      PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
        conjunct_array << Spacy::Token.new(py_conjunct)
      end
      conjunct_array
    end

    # Returns Tokens that are to the left of the span, whose heads are within the span.
    # @return [Array<Token>] an array of tokens
    def lefts
      left_array = []
      PyCall::List.(@py_span.lefts).each do |py_left|
        left_array << Spacy::Token.new(py_left)
      end
      left_array
    end

    # Returns Tokens that are to the right of the span, whose heads are within the span.
    # @return [Array<Token>] an array of Tokens
    def rights
      right_array = []
      PyCall::List.(@py_span.rights).each do |py_right|
        right_array << Spacy::Token.new(py_right)
      end
      right_array
    end

    # Returns Tokens that are within the span and tokens that descend from them.
    # @return [Array<Token>] an array of tokens
    def subtree
      subtree_array = []
      PyCall::List.(@py_span.subtree).each do |py_subtree|
        subtree_array << Spacy::Token.new(py_subtree)
      end
      subtree_array
    end

    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
    # NOTE(review): no respond_to_missing? override, so `respond_to?` reports
    # false for the delegated Python methods — consider adding one.
    def method_missing(name, *args)
      @py_span.send(name, *args)
    end
  end
|
182
|
+
|
183
|
+
# See also spaCy Python API document for [`Token`](https://spacy.io/api/token).
|
184
|
+
class Token
|
185
|
+
|
186
|
+
# @return [Object] a Python `Token` instance accessible via `PyCall`
|
187
|
+
attr_reader :py_token
|
188
|
+
|
189
|
+
# @return [String] a string representing the token
|
190
|
+
attr_reader :text
|
191
|
+
|
192
|
+
# It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens. There is no way to generate a token from scratch but relying on a pre-exising Python {Token} object.
|
193
|
+
# @param py_token [Object] Python `Token` object
|
194
|
+
def initialize(py_token)
|
195
|
+
@py_token = py_token
|
196
|
+
@text = @py_token.text
|
197
|
+
end
|
198
|
+
|
199
|
+
# Returns the token in question and the tokens that descend from it.
|
200
|
+
# @return [Array<Object>] an (Ruby) array of Python `Token` objects
|
201
|
+
def subtree
|
202
|
+
descendant_array = []
|
203
|
+
PyCall::List.(@py_token.subtree).each do |descendant|
|
204
|
+
descendant_array << descendant
|
205
|
+
end
|
206
|
+
descendant_array
|
207
|
+
end
|
208
|
+
|
209
|
+
# Returns the token's ancestors.
|
210
|
+
# @return [Array<Object>] an (Ruby) array of Python `Token` objects
|
211
|
+
def ancestors
|
212
|
+
ancestor_array = []
|
213
|
+
PyCall::List.(@py_token.ancestors).each do |ancestor|
|
214
|
+
ancestor_array << ancestor
|
215
|
+
end
|
216
|
+
ancestor_array
|
217
|
+
end
|
218
|
+
|
219
|
+
# Returns a sequence of the token's immediate syntactic children.
|
220
|
+
# @return [Array<Object>] an (Ruby) array of Python `Token` objects
|
221
|
+
def children
|
222
|
+
child_array = []
|
223
|
+
PyCall::List.(@py_token.children).each do |child|
|
224
|
+
child_array << child
|
225
|
+
end
|
226
|
+
child_array
|
227
|
+
end
|
228
|
+
|
229
|
+
# The leftward immediate children of the word in the syntactic dependency parse.
|
230
|
+
# @return [Array<Object>] an (Ruby) array of Python `Token` objects
|
231
|
+
def lefts
|
232
|
+
token_array = []
|
233
|
+
PyCall::List.(@py_token.lefts).each do |token|
|
234
|
+
token_array << token
|
235
|
+
end
|
236
|
+
token_array
|
237
|
+
end
|
238
|
+
|
239
|
+
# The rightward immediate children of the word in the syntactic dependency parse.
|
240
|
+
# @return [Array<Object>] an (Ruby) array of Python `Token` objects
|
241
|
+
def rights
|
242
|
+
token_array = []
|
243
|
+
PyCall::List.(@py_token.rights).each do |token|
|
244
|
+
token_array << token
|
245
|
+
end
|
246
|
+
token_array
|
247
|
+
end
|
248
|
+
|
249
|
+
# String representation of the token.
|
250
|
+
# @return [String]
|
251
|
+
def to_s
|
252
|
+
@text
|
253
|
+
end
|
254
|
+
|
255
|
+
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
256
|
+
def method_missing(name, *args)
|
257
|
+
@py_token.send(name, *args)
|
258
|
+
end
|
259
|
+
end
|
260
|
+
|
261
|
+
  # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
  #
  # A thin Ruby wrapper around a Python `Doc`. The document is created by
  # exec'ing Python source, and an id string (+@spacy_doc_id+) names it in the
  # Python namespace so later calls ({#retokenize}, {#similarity}, {#displacy})
  # can refer to it from Python code strings.
  class Doc

    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
    attr_reader :spacy_nlp_id

    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
    attr_reader :spacy_doc_id

    # @return [Object] a Python `Doc` instance accessible via `PyCall`
    attr_reader :py_doc

    # @return [String] a text string of the document
    attr_reader :text

    include Enumerable

    # Enumerable#count drives all three size aliases (requires #each below).
    alias_method :length, :count
    alias_method :len, :count
    alias_method :size, :count

    # Creates a new instance of {Doc}.
    # The text is stored in a Python variable and run through the pipeline
    # named by `nlp_id` on the Python side.
    # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
    # @param text [String] The text string to be analyzed
    def initialize(nlp_id, text)
      @text = text
      @spacy_nlp_id = nlp_id
      @spacy_doc_id = "doc_#{text.object_id}"
      # Escape double quotes; the text is embedded in a Python triple-quoted string.
      quoted = text.gsub('"', '\"')
      PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
      PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
      @py_doc = PyCall.eval(@spacy_doc_id)
    end


    # Retokenizes the text merging a span into a single token.
    # Runs a Python `with doc.retokenize()` block; `end_index` is inclusive
    # here and converted to Python's exclusive slice end (`end_index + 1`).
    # @param start_index [Integer] The start position of the span to be retokenized in the document
    # @param end_index [Integer] The end position of the span to be retokenized in the document
    # @param attributes [Hash] Attributes to set on the merged token
    def retokenize(start_index, end_index, attributes = {})
      py_attrs = PyCall::Dict.(attributes)
      PyCall.exec(<<PY)
with #{@spacy_doc_id}.retokenize() as retokenizer:
    retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
PY
      # Re-read the doc: retokenization mutates it on the Python side.
      @py_doc = PyCall.eval(@spacy_doc_id)
    end

    # Retokenizes the text splitting the specified token.
    # @param pos_in_doc [Integer] The position of the span to be retokenized in the document
    # @param split_array [Array<String>] text strings of the split results
    # @param head_pos_in_split [Integer] The position of the head element within the split results
    # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
    # @param attributes [Hash] The attributes of the split elements
    def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
      py_attrs = PyCall::Dict.(attributes)
      py_split_array = PyCall::List.(split_array)
      PyCall.exec(<<PY)
with #{@spacy_doc_id}.retokenize() as retokenizer:
    heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
    attrs = #{py_attrs}
    split_array = #{py_split_array}
    retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
PY
      # Re-read the doc: retokenization mutates it on the Python side.
      @py_doc = PyCall.eval(@spacy_doc_id)
    end

    # String representation of the document.
    # @return [String]
    def to_s
      @text
    end

    # Returns an array of tokens contained in the doc.
    # @return [Array<Token>]
    def tokens
      results = []
      PyCall::List.(@py_doc).each do |py_token|
        results << Token.new(py_token)
      end
      results
    end

    # Iterates over the elements in the doc yielding a token instance.
    # Required by Enumerable (and by the count-based aliases above).
    def each
      PyCall::List.(@py_doc).each do |py_token|
        yield Token.new(py_token)
      end
    end

    # Returns a span of the specified range within the doc.
    # The method should be used either of the two ways: `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
    # @param range_or_start [Range, Integer] A range object, or, alternatively, an integer that represents the start position of the span
    # @param optional_size [Integer] An integer representing the size of the span
    # @return [Span]
    def span(range_or_start, optional_size = nil)
      if optional_size
        start_index = range_or_start
        temp = tokens[start_index ... start_index + optional_size]
      else
        start_index = range_or_start.first
        range = range_or_start
        temp = tokens[range]
      end

      # Span's end_index is inclusive, hence the -1.
      end_index = start_index + temp.size - 1

      Span.new(self, start_index: start_index, end_index: end_index)
    end

    # Returns an array of spans representing noun chunks.
    # @return [Array<Span>]
    def noun_chunks
      chunk_array = []
      py_chunks = PyCall::List.(@py_doc.noun_chunks)
      py_chunks.each do |py_chunk|
        chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
      end
      chunk_array
    end

    # Returns an array of spans representing sentences.
    # @return [Array<Span>]
    def sents
      sentence_array = []
      py_sentences = PyCall::List.(@py_doc.sents)
      py_sentences.each do |py_sent|
        sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
      end
      sentence_array
    end

    # Returns an array of spans representing named entities.
    # @return [Array<Span>]
    def ents
      # so that ents can be "each"-ed in Ruby
      # NOTE(review): unlike Span#ents, this returns raw Python span objects,
      # not wrapped Spacy::Span instances — confirm whether that is intended.
      ent_array = []
      PyCall::List.(@py_doc.ents).each do |ent|
        ent_array << ent
      end
      ent_array
    end

    # Returns a span if given a range object; returns a token if given an integer representing a position in the doc.
    # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
    def [](range)
      if range.is_a?(Range)
        py_span = @py_doc[range]
        return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
      else
        return Token.new(@py_doc[range])
      end
    end

    # Returns a semantic similarity estimate.
    # Evaluated as Python source, so both docs must be registered in the
    # Python namespace under their id strings.
    # @param other [Doc] the other doc to which a similarity estimation is made
    # @return [Float]
    def similarity(other)
      PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
    end

    # Visualize the document in one of two styles: dep (dependencies) or ent (named entities).
    # @param style [String] Either `dep` or `ent`
    # @param compact [Boolean] Only relevant to the `dep' style
    # @return [String] in the case of `dep`, the output text is an SVG while in the `ent` style, the output text is an HTML.
    def displacy(style: "dep", compact: false)
      # Ruby's true/false are capitalized to Python's True/False in the source string.
      PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
    end

    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
    # NOTE(review): no respond_to_missing? override, so `respond_to?` reports
    # false for the delegated Python methods — consider adding one.
    def method_missing(name, *args)
      @py_doc.send(name, *args)
    end
  end
|
434
|
+
|
435
|
+
  # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
  #
  # A thin Ruby wrapper around a Python `Matcher` bound to the vocab of a
  # {Language} pipeline.
  class Matcher

    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
    attr_reader :spacy_matcher_id

    # @return [Object] a Python `Matcher` instance accessible via `PyCall`
    attr_reader :py_matcher

    # Creates a {Matcher} instance
    # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
    def initialize(nlp_id)
      @spacy_matcher_id = "doc_#{nlp_id}_matcher"
      PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
      @py_matcher = PyCall.eval(@spacy_matcher_id)
    end

    # Adds a label string and a text pattern.
    # @param text [String] a label string given to the pattern
    # @param pattern [Array<Array<Hash>>] alternative sequences of text patterns
    def add(text, pattern)
      @py_matcher.add(text, pattern)
    end

    # Execute the match.
    # NOTE(review): this parses the *string representation* of the Python
    # result (a list of (match_id, start, end) tuples) with StringScanner
    # rather than iterating the Python list directly — fragile if the repr
    # format ever changes.
    # @param doc [Doc] An {Doc} instance
    # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] The id of the matched pattern, the starting position, and the end position
    def match(doc)
      str_results = PyCall.eval("#{@spacy_matcher_id}(#{doc.spacy_doc_id})").to_s
      # Strip the surrounding "[" and "]" before scanning the triples.
      s = StringScanner.new(str_results[1..-2])
      results = []
      while s.scan_until(/(\d+), (\d+), (\d+)/)
        next unless s.matched
        triple = s.matched.split(", ")
        match_id = triple[0].to_i
        start_index = triple[1].to_i
        # Python's exclusive end is converted to an inclusive end_index.
        end_index = triple[2].to_i - 1
        results << {match_id: match_id, start_index: start_index, end_index: end_index}
      end
      results
    end
  end
|
477
|
+
|
478
|
+
# See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
class Language

  # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
  attr_reader :spacy_nlp_id

  # @return [Object] a Python `Language` instance accessible via `PyCall`
  attr_reader :py_nlp

  # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
  # @param model [String] A language model installed in the system
  def initialize(model = "en_core_web_sm")
    # object_id makes the Python-side variable name unique per Ruby instance.
    @spacy_nlp_id = "nlp_#{model.object_id}"
    PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
    PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
    @py_nlp = PyCall.eval(@spacy_nlp_id)
  end

  # Reads and analyzes the given text.
  # @param text [String] A text to be read and analyzed
  # @return [Doc]
  def read(text)
    Doc.new(@spacy_nlp_id, text)
  end

  # Generates a matcher for the current language model.
  # @return [Matcher]
  def matcher
    Matcher.new(@spacy_nlp_id)
  end

  # A utility method to look up a vocabulary item of the given id.
  # @param id [Integer] A vocabulary id
  # @return [Object] A Python `Lexeme` object
  def vocab_string_lookup(id)
    PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
  end

  # A utility method to list pipeline components.
  # @return [Array<String>] An array of text strings representing pipeline components
  def pipe_names
    # PyCall::List is Enumerable, so `to_a` replaces the manual each/<< loop.
    PyCall::List.(@py_nlp.pipe_names).to_a
  end

  # A utility method to get the tokenizer Python object.
  # @return [Object] Python `Tokenizer` object
  def tokenizer
    PyCall.eval("#{@spacy_nlp_id}.tokenizer")
  end

  # A utility method to get a Python `Lexeme` object.
  # @param text [String] A text string representing a lexeme
  # @return [Object] Python `Lexeme` object
  def get_lexeme(text)
    # Escape single quotes so `text` can be embedded in the single-quoted
    # Python string literal below. NOTE: the original `gsub("'", "\'")` was a
    # no-op ("\'" is just "'" in Ruby), and a replacement string of "\\'"
    # would insert the post-match instead of a backslash; the block form of
    # gsub sidesteps both pitfalls.
    escaped = text.gsub("'") { "\\'" }
    PyCall.eval("#{@spacy_nlp_id}.vocab['#{escaped}']")
  end

  # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
  # @param vector [Object] A vector representation of a word (whether existing or non-existing)
  # @param n [Integer] The number of results to return
  # @return [Array<Hash{:key => Integer, :text => String, :best_row => Integer, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
  def most_similar(vector, n)
    vec_array = Numpy.asarray([vector])
    py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
    # Resolve each returned key to its text on the Python side in one eval.
    key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
    keys = key_texts.map { |kt| kt[0] }
    texts = key_texts.map { |kt| kt[1] }
    best_rows = PyCall::List.(py_result[1])[0]
    scores = PyCall::List.(py_result[2])[0]

    (0...n).map do |i|
      { key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i] }
    end
  end

  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
  def method_missing(name, *args)
    @py_nlp.send(name, *args)
  end

  # Paired with method_missing (Ruby best practice) so that `respond_to?`
  # also reflects the methods delegated to the underlying Python object.
  def respond_to_missing?(name, include_private = false)
    @py_nlp.respond_to?(name) || super
  end
end
|
565
|
+
|
566
|
+
end
|
567
|
+
|