RubyGems - ruby-spacy - Versions diffs - 0.1.0 - Mend

ruby-spacy 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

checksums.yaml +7 -0
data/.gitignore +58 -0
data/.yardopts +2 -0
data/Gemfile +18 -0
data/Gemfile.lock +39 -0
data/LICENSE.txt +21 -0
data/README.md +498 -0
data/Rakefile +12 -0
data/bin/console +15 -0
data/bin/setup +8 -0
data/examples/get_started/lexeme.rb +24 -0
data/examples/get_started/linguistic_annotations.rb +32 -0
data/examples/get_started/most_similar.rb +46 -0
data/examples/get_started/named_entities.rb +24 -0
data/examples/get_started/outputs/test_dep.svg +84 -0
data/examples/get_started/outputs/test_dep_compact.svg +84 -0
data/examples/get_started/outputs/test_ent.html +11 -0
data/examples/get_started/pos_tags_and_dependencies.rb +31 -0
data/examples/get_started/similarity.rb +13 -0
data/examples/get_started/tokenization.rb +22 -0
data/examples/get_started/visualizing_dependencies.rb +14 -0
data/examples/get_started/visualizing_dependencies_compact.rb +12 -0
data/examples/get_started/visualizing_named_entities.rb +12 -0
data/examples/get_started/vocab.rb +10 -0
data/examples/get_started/word_vectors.rb +24 -0
data/examples/japanese/ancestors.rb +44 -0
data/examples/japanese/entity_annotations_and_labels.rb +45 -0
data/examples/japanese/information_extraction.rb +27 -0
data/examples/japanese/lemmatization.rb +32 -0
data/examples/japanese/most_similar.rb +46 -0
data/examples/japanese/named_entity_recognition.rb +27 -0
data/examples/japanese/navigating_parse_tree.rb +34 -0
data/examples/japanese/noun_chunks.rb +23 -0
data/examples/japanese/outputs/test_dep.svg +149 -0
data/examples/japanese/outputs/test_ent.html +16 -0
data/examples/japanese/pos_tagging.rb +34 -0
data/examples/japanese/sentence_segmentation.rb +16 -0
data/examples/japanese/similarity.rb +12 -0
data/examples/japanese/tokenization.rb +38 -0
data/examples/japanese/visualizing_dependencies.rb +13 -0
data/examples/japanese/visualizing_named_entities.rb +14 -0
data/examples/linguistic_features/ancestors.rb +41 -0
data/examples/linguistic_features/entity_annotations_and_labels.rb +29 -0
data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +20 -0
data/examples/linguistic_features/information_extraction.rb +36 -0
data/examples/linguistic_features/iterating_children.rb +24 -0
data/examples/linguistic_features/iterating_lefts_and_rights.rb +20 -0
data/examples/linguistic_features/lemmatization.rb +31 -0
data/examples/linguistic_features/morphology.rb +17 -0
data/examples/linguistic_features/named_entity_recognition.rb +25 -0
data/examples/linguistic_features/navigating_parse_tree.rb +32 -0
data/examples/linguistic_features/noun_chunks.rb +27 -0
data/examples/linguistic_features/outputs/test_ent.html +11 -0
data/examples/linguistic_features/pos_tagging.rb +31 -0
data/examples/linguistic_features/retokenize_1.rb +29 -0
data/examples/linguistic_features/retokenize_2.rb +16 -0
data/examples/linguistic_features/rule_based_morphology.rb +12 -0
data/examples/linguistic_features/sentence_segmentation.rb +16 -0
data/examples/linguistic_features/similarity.rb +14 -0
data/examples/linguistic_features/similarity_between_spans.rb +23 -0
data/examples/linguistic_features/special_case_tokenization_rules.rb +19 -0
data/examples/linguistic_features/tokenization.rb +23 -0
data/examples/rule_based_matching/creating_spans_from_matches.rb +16 -0
data/examples/rule_based_matching/matcher.rb +19 -0
data/lib/ruby-spacy.rb +567 -0
data/lib/ruby-spacy/version.rb +6 -0
data/ruby-spacy.gemspec +42 -0
metadata +157 -0

data/examples/japanese/outputs/test_ent.html ADDED Viewed

@@ -0,0 +1,16 @@
+<div class="entities" style="line-height: 2.5; direction: ltr">
+<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
+    セバスチアン・スラン
+    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
+</mark>
+が
+<mark class="entity" style="background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
+    2007年
+    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">DATE</span>
+</mark>
+に
+<mark class="entity" style="background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
+    グーグル
+    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PRODUCT</span>
+</mark>
+で自動運転車に取り組み始めたとき、社外の人間で彼のことを真剣に捉えている者はほとんどいなかった。</div>

data/examples/japanese/pos_tagging.rb ADDED Viewed

@@ -0,0 +1,34 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("ja_core_news_lg")
+doc = nlp.read("任天堂は1983年にファミコンを14,800円で発売した。")
+headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
+rows = []
+doc.each do |token|
+  rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
+end
+table = Terminal::Table.new rows: rows, headings: headings
+puts table
+# +------------+------------+-------+--------------------------+--------+--------+----------+---------+
+# | text       | lemma      | pos   | tag                      | dep    | shape  | is_alpha | is_stop |
+# +------------+------------+-------+--------------------------+--------+--------+----------+---------+
+# | 任天堂     | 任天堂     | PROPN | 名詞-固有名詞-一般       | nsubj  | xxx    | true     | false   |
+# | は         | は         | ADP   | 助詞-係助詞              | case   | x      | true     | true    |
+# | 1983       | 1983       | NUM   | 名詞-数詞                | nummod | dddd   | false    | false   |
+# | 年         | 年         | NOUN  | 名詞-普通名詞-助数詞可能 | obl    | x      | true     | false   |
+# | に         | に         | ADP   | 助詞-格助詞              | case   | x      | true     | true    |
+# | ファミコン | ファミコン | NOUN  | 名詞-普通名詞-一般       | obj    | xxxx   | true     | false   |
+# | を         | を         | ADP   | 助詞-格助詞              | case   | x      | true     | true    |
+# | 14,800     | 14,800     | NUM   | 名詞-数詞                | fixed  | dd,ddd | false    | false   |
+# | 円         | 円         | NOUN  | 名詞-普通名詞-助数詞可能 | obl    | x      | true     | false   |
+# | で         | で         | ADP   | 助詞-格助詞              | case   | x      | true     | true    |
+# | 発売       | 発売       | VERB  | 名詞-普通名詞-サ変可能   | ROOT   | xx     | true     | false   |
+# | し         | する       | AUX   | 動詞-非自立可能          | aux    | x      | true     | true    |
+# | た         | た         | AUX   | 助動詞                   | aux    | x      | true     | true    |
+# | 。         | 。         | PUNCT | 補助記号-句点            | punct  | 。     | false    | false   |
+# +------------+------------+-------+--------------------------+--------+--------+----------+---------+

data/examples/japanese/sentence_segmentation.rb ADDED Viewed

@@ -0,0 +1,16 @@
+require "ruby-spacy"
+nlp = Spacy::Language.new("ja_core_news_sm")
+doc = nlp.read("これは文です。今私は「これは文です」と言いました。")
+puts "doc has annotation SENT_START: " + doc.has_annotation("SENT_START").to_s
+doc.sents.each do |sent|
+  puts sent.text
+end
+# doc has annotation SENT_START: true
+# これは文です。
+# 今私は「これは文です」と言いました。

data/examples/japanese/similarity.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require "ruby-spacy"
+nlp = Spacy::Language.new("ja_core_news_lg")
+ja_doc1 = nlp.read("今日は雨ばっかり降って、嫌な天気ですね。")
+puts "doc1: #{ja_doc1.text}"
+ja_doc2 = nlp.read("あいにくの悪天候で残念です。")
+puts "doc2: #{ja_doc2.text}"
+puts "Similarity: #{ja_doc1.similarity(ja_doc2)}"
+# doc1: 今日は雨ばっかり降って、嫌な天気ですね。
+# doc2: あいにくの悪天候で残念です。
+# Similarity: 0.8684192637149641

data/examples/japanese/tokenization.rb ADDED Viewed

@@ -0,0 +1,38 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("ja_core_news_sm")
+doc = nlp.read("アップルはイギリスの新興企業を10億ドルで買収しようとしている。")
+headings = ["text"]
+rows = []
+doc.each do |token|
+  rows << [token.text]
+end
+table = Terminal::Table.new rows: rows, headings: headings
+puts table
+# +----------+
+# | text     |
+# +----------+
+# | アップル |
+# | は       |
+# | イギリス |
+# | の       |
+# | 新興     |
+# | 企業     |
+# | を       |
+# | 10億     |
+# | ドル     |
+# | で       |
+# | 買収     |
+# | しよう   |
+# | と       |
+# | し       |
+# | て       |
+# | いる     |
+# | 。       |
+# +----------+

data/examples/japanese/visualizing_dependencies.rb ADDED Viewed

@@ -0,0 +1,13 @@
+require "ruby-spacy"
+require "terminal-table"
+# Render the parsed sentence with displacy in 'dep' style and save it next to this script.
+nlp = Spacy::Language.new("ja_core_news_sm")
+doc = nlp.read("自動運転車は保険責任を製造者に転嫁する。")
+# NOTE(review): the second positional argument presumably toggles compact mode — confirm against Doc#displacy.
+svg_markup = doc.displacy('dep', false)
+svg_path = File.join(File.dirname(__FILE__), "outputs/test_dep.svg")
+File.open(svg_path, "w") { |out| out.write(svg_markup) }

data/examples/japanese/visualizing_named_entities.rb ADDED Viewed

@@ -0,0 +1,14 @@
+require "ruby-spacy"
+require "terminal-table"
+# Render the parsed sentence with displacy in 'ent' style and save the markup next to this script.
+nlp = Spacy::Language.new("ja_core_news_lg")
+sentence = "セバスチアン・スランが2007年にグーグルで自動運転車に取り組み始めたとき、社外の人間で彼のことを真剣に捉えている者はほとんどいなかった。"
+entity_markup = nlp.read(sentence).displacy('ent')
+html_path = File.join(File.dirname(__FILE__), "outputs/test_ent.html")
+File.open(html_path, "w") { |out| out.write(entity_markup) }

data/examples/linguistic_features/ancestors.rb ADDED Viewed

@@ -0,0 +1,41 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("en_core_web_sm")
+sentence = "Credit and mortgage account holders must submit their requests"
+doc = nlp.read(sentence)
+headings = ["text", "dep", "n_lefts", "n_rights", "ancestors"]
+rows = []
+root = doc.tokens.select do |t|
+  # need to compare token and its head using their indices
+  t.i == t.head.i
+end.first
+puts "The sentence: " + sentence
+subject = Spacy::Token.new(root.lefts[0])
+puts "The root of the sentence is: " + root.text
+puts "The subject of the sentence is: " + subject.text
+subject.subtree.each do |descendant|
+  # need to convert "ancestors" object from a python generator to a ruby array
+  ancestors = Spacy::generator_to_array(descendant.ancestors)
+  rows << [descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, ancestors]
+end
+table = Terminal::Table.new rows: rows, headings: headings
+print table
+# +----------+----------+---------+----------+------------------------------------+
+# | text     | dep      | n_lefts | n_rights | ancestors                          |
+# +----------+----------+---------+----------+------------------------------------+
+# | Credit   | nmod     | 0       | 2        | [holders, submit]                  |
+# | and      | cc       | 0       | 0        | [Credit, holders, submit]          |
+# | mortgage | compound | 0       | 0        | [account, Credit, holders, submit] |
+# | account  | conj     | 1       | 0        | [Credit, holders, submit]          |
+# | holders  | nsubj    | 1       | 0        | [submit]                           |
+# +----------+----------+---------+----------+------------------------------------+

data/examples/linguistic_features/entity_annotations_and_labels.rb ADDED Viewed

@@ -0,0 +1,29 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("en_core_web_sm")
+sentence = "San Francisco considers banning sidewalk delivery robots"
+doc = nlp.read(sentence)
+headings = ["text", "ent_iob", "ent_iob_", "ent_type_"]
+rows = []
+doc.each do |ent|
+  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type_]
+end
+table = Terminal::Table.new rows: rows, headings: headings
+print table
+# +-----------+---------+----------+-----------+
+# | text      | ent_iob | ent_iob_ | ent_type_ |
+# +-----------+---------+----------+-----------+
+# | San       | 3       | B        | GPE       |
+# | Francisco | 1       | I        | GPE       |
+# | considers | 2       | O        |           |
+# | banning   | 2       | O        |           |
+# | sidewalk  | 2       | O        |           |
+# | delivery  | 2       | O        |           |
+# | robots    | 2       | O        |           |
+# +-----------+---------+----------+-----------+

data/examples/linguistic_features/finding_a_verb_with_a_subject.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("en_core_web_sm")
+doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")
+results = []
+doc.each do |token|
+  if token.dep_ == "nsubj" && token.head.pos_ == "VERB"
+    results << token.head
+  end
+end
+puts results.to_s
+# [shift]

data/examples/linguistic_features/information_extraction.rb ADDED Viewed

@@ -0,0 +1,36 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("en_core_web_sm")
+nlp.add_pipe("merge_entities")
+nlp.add_pipe("merge_noun_chunks")
+sentence = "Credit and mortgage account holders must submit their requests"
+doc = nlp.read(sentence)
+texts = [
+    "Net income was $9.4 million compared to the prior year of $2.7 million.",
+    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
+]
+texts.each do |text|
+  doc = nlp.read(text)
+  doc.each do |token|
+    if token.ent_type_ == "MONEY"
+      if ["attr", "dobj"].index token.dep_
+        subj = Spacy.generator_to_array(token.head.lefts).select{|t| t.dep_ == "nsubj"}
+        if !subj.empty?
+          puts(subj[0].text + " --> " + token.text)
+        end
+      elsif token.dep_ == "pobj" and token.head.dep_ == "prep"
+        puts token.head.head.text + " --> " + token.text
+      end
+    end
+  end
+end
+# Net income --> $9.4 million
+# the prior year --> $2.7 million
+# Revenue --> twelve billion dollars
+# a loss --> 1b

data/examples/linguistic_features/iterating_children.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("en_core_web_sm")
+doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")
+results = []
+doc.each do |token|
+  if token.pos_ == "VERB"
+    token.children.each do |child|
+      if child.dep_ == "nsubj"
+        results << child.head
+      end
+    end
+  end
+end
+puts results.to_s
+# [shift]

data/examples/linguistic_features/iterating_lefts_and_rights.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("en_core_web_sm")
+doc = nlp.read("bright red apples on the tree")
+puts "Text: " + doc
+puts "Words to the left of 'apple': " + Spacy.generator_to_array(doc[2].lefts).to_s
+puts "Words to the right of 'apple': " + Spacy.generator_to_array(doc[2].rights).to_s
+puts "Num of the words to the left of 'apple': " + doc[2].n_lefts.to_s
+puts "Num of the words to the right of 'apple': " + doc[2].n_rights.to_s
+# Text: bright red apples on the tree
+# Words to the left of 'apple': [bright, red]
+# Words to the right of 'apple': [on]
+# Num of the words to the left of 'apple': 2
+# Num of the words to the right of 'apple': 1

data/examples/linguistic_features/lemmatization.rb ADDED Viewed

@@ -0,0 +1,31 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("en_core_web_sm")
+lemmatizer = nlp.get_pipe("lemmatizer")
+puts "Lemmatizer mode: " + lemmatizer.mode
+doc = nlp.read("I was reading the paper.")
+headings = ["lemma"]
+rows = []
+doc.each do |token|
+  rows << [token.lemma_]
+end
+table = Terminal::Table.new rows: rows, headings: headings
+puts table
+# Lemmatizer mode: rule
+# +-------+
+# | lemma |
+# +-------+
+# | I     |
+# | be    |
+# | read  |
+# | the   |
+# | paper |
+# | .     |
+# +-------+

data/examples/linguistic_features/morphology.rb ADDED Viewed

@@ -0,0 +1,17 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("en_core_web_sm")
+puts "Pipeline: " + nlp.pipe_names.to_s
+doc = nlp.read("I was reading the paper.")
+token = doc[0]
+puts "Morph features of the first word: " + token.morph.to_s
+puts "PronType of the word: " + token.morph.get("PronType").to_s
+# Pipeline: ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
+# Morph features of the first word: Case=Nom|Number=Sing|Person=1|PronType=Prs
+# PronType of the word: ['Prs']

data/examples/linguistic_features/named_entity_recognition.rb ADDED Viewed

@@ -0,0 +1,25 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("en_core_web_sm")
+sentence = "Apple is looking at buying U.K. startup for $1 billion"
+doc = nlp.read(sentence)
+headings = ["text", "start", "end", "label"]
+rows = []
+doc.ents.each do |ent|
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+end
+table = Terminal::Table.new rows: rows, headings: headings
+puts table
+# +------------+-------+-----+-------+
+# | text       | start | end | label |
+# +------------+-------+-----+-------+
+# | Apple      | 0     | 5   | ORG   |
+# | U.K.       | 27    | 31  | GPE   |
+# | $1 billion | 44    | 54  | MONEY |
+# +------------+-------+-----+-------+

data/examples/linguistic_features/navigating_parse_tree.rb ADDED Viewed

@@ -0,0 +1,32 @@
+require "ruby-spacy"
+require "terminal-table"
+nlp = Spacy::Language.new("en_core_web_sm")
+lemmatizer = nlp.get_pipe("lemmatizer")
+puts "Lemmatizer mode: " + lemmatizer.mode
+doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")
+headings = ["text", "dep", "head text", "head pos", "children"]
+rows = []
+doc.each do |token|
+  rows << [token.text, token.dep_, token.head.text, token.head.pos_, token.children.to_s]
+end
+table = Terminal::Table.new rows: rows, headings: headings
+puts table
+# Lemmatizer mode: rule
+# +---------------+----------+-----------+----------+---------------------------+
+# | text          | dep      | head text | head pos | children                  |
+# +---------------+----------+-----------+----------+---------------------------+
+# | Autonomous    | amod     | cars      | NOUN     | []                        |
+# | cars          | nsubj    | shift     | VERB     | [Autonomous]              |
+# | shift         | ROOT     | shift     | VERB     | [cars, liability, toward] |
+# | insurance     | compound | liability | NOUN     | []                        |
+# | liability     | dobj     | shift     | VERB     | [insurance]               |
+# | toward        | prep     | shift     | VERB     | [manufacturers]           |
+# | manufacturers | pobj     | toward    | ADP      | []                        |
+# +---------------+----------+-----------+----------+---------------------------+