ruby-spacy 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +58 -0
  3. data/.yardopts +2 -0
  4. data/Gemfile +18 -0
  5. data/Gemfile.lock +39 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +498 -0
  8. data/Rakefile +12 -0
  9. data/bin/console +15 -0
  10. data/bin/setup +8 -0
  11. data/examples/get_started/lexeme.rb +24 -0
  12. data/examples/get_started/linguistic_annotations.rb +32 -0
  13. data/examples/get_started/most_similar.rb +46 -0
  14. data/examples/get_started/named_entities.rb +24 -0
  15. data/examples/get_started/outputs/test_dep.svg +84 -0
  16. data/examples/get_started/outputs/test_dep_compact.svg +84 -0
  17. data/examples/get_started/outputs/test_ent.html +11 -0
  18. data/examples/get_started/pos_tags_and_dependencies.rb +31 -0
  19. data/examples/get_started/similarity.rb +13 -0
  20. data/examples/get_started/tokenization.rb +22 -0
  21. data/examples/get_started/visualizing_dependencies.rb +14 -0
  22. data/examples/get_started/visualizing_dependencies_compact.rb +12 -0
  23. data/examples/get_started/visualizing_named_entities.rb +12 -0
  24. data/examples/get_started/vocab.rb +10 -0
  25. data/examples/get_started/word_vectors.rb +24 -0
  26. data/examples/japanese/ancestors.rb +44 -0
  27. data/examples/japanese/entity_annotations_and_labels.rb +45 -0
  28. data/examples/japanese/information_extraction.rb +27 -0
  29. data/examples/japanese/lemmatization.rb +32 -0
  30. data/examples/japanese/most_similar.rb +46 -0
  31. data/examples/japanese/named_entity_recognition.rb +27 -0
  32. data/examples/japanese/navigating_parse_tree.rb +34 -0
  33. data/examples/japanese/noun_chunks.rb +23 -0
  34. data/examples/japanese/outputs/test_dep.svg +149 -0
  35. data/examples/japanese/outputs/test_ent.html +16 -0
  36. data/examples/japanese/pos_tagging.rb +34 -0
  37. data/examples/japanese/sentence_segmentation.rb +16 -0
  38. data/examples/japanese/similarity.rb +12 -0
  39. data/examples/japanese/tokenization.rb +38 -0
  40. data/examples/japanese/visualizing_dependencies.rb +13 -0
  41. data/examples/japanese/visualizing_named_entities.rb +14 -0
  42. data/examples/linguistic_features/ancestors.rb +41 -0
  43. data/examples/linguistic_features/entity_annotations_and_labels.rb +29 -0
  44. data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +20 -0
  45. data/examples/linguistic_features/information_extraction.rb +36 -0
  46. data/examples/linguistic_features/iterating_children.rb +24 -0
  47. data/examples/linguistic_features/iterating_lefts_and_rights.rb +20 -0
  48. data/examples/linguistic_features/lemmatization.rb +31 -0
  49. data/examples/linguistic_features/morphology.rb +17 -0
  50. data/examples/linguistic_features/named_entity_recognition.rb +25 -0
  51. data/examples/linguistic_features/navigating_parse_tree.rb +32 -0
  52. data/examples/linguistic_features/noun_chunks.rb +27 -0
  53. data/examples/linguistic_features/outputs/test_ent.html +11 -0
  54. data/examples/linguistic_features/pos_tagging.rb +31 -0
  55. data/examples/linguistic_features/retokenize_1.rb +29 -0
  56. data/examples/linguistic_features/retokenize_2.rb +16 -0
  57. data/examples/linguistic_features/rule_based_morphology.rb +12 -0
  58. data/examples/linguistic_features/sentence_segmentation.rb +16 -0
  59. data/examples/linguistic_features/similarity.rb +14 -0
  60. data/examples/linguistic_features/similarity_between_spans.rb +23 -0
  61. data/examples/linguistic_features/special_case_tokenization_rules.rb +19 -0
  62. data/examples/linguistic_features/tokenization.rb +23 -0
  63. data/examples/rule_based_matching/creating_spans_from_matches.rb +16 -0
  64. data/examples/rule_based_matching/matcher.rb +19 -0
  65. data/lib/ruby-spacy.rb +567 -0
  66. data/lib/ruby-spacy/version.rb +6 -0
  67. data/ruby-spacy.gemspec +42 -0
  68. metadata +157 -0
@@ -0,0 +1,16 @@
1
+ <div class="entities" style="line-height: 2.5; direction: ltr">
2
+ <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
3
+ セバスチアン・スラン
4
+ <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
5
+ </mark>
6
+
7
+ <mark class="entity" style="background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
8
+ 2007年
9
+ <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">DATE</span>
10
+ </mark>
11
+
12
+ <mark class="entity" style="background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
13
+ グーグル
14
+ <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PRODUCT</span>
15
+ </mark>
16
+ で自動運転車に取り組み始めたとき、社外の人間で彼のことを真剣に捉えている者はほとんどいなかった。</div>
@@ -0,0 +1,34 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("ja_core_news_lg")
5
+ doc = nlp.read("任天堂は1983年にファミコンを14,800円で発売した。")
6
+
7
+ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
8
+ rows = []
9
+
10
+ doc.each do |token|
11
+ rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
12
+ end
13
+
14
+ table = Terminal::Table.new rows: rows, headings: headings
15
+ puts table
16
+
17
+ # +------------+------------+-------+--------------------------+--------+--------+----------+---------+
18
+ # | text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
19
+ # +------------+------------+-------+--------------------------+--------+--------+----------+---------+
20
+ # | 任天堂 | 任天堂 | PROPN | 名詞-固有名詞-一般 | nsubj | xxx | true | false |
21
+ # | は | は | ADP | 助詞-係助詞 | case | x | true | true |
22
+ # | 1983 | 1983 | NUM | 名詞-数詞 | nummod | dddd | false | false |
23
+ # | 年 | 年 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
24
+ # | に | に | ADP | 助詞-格助詞 | case | x | true | true |
25
+ # | ファミコン | ファミコン | NOUN | 名詞-普通名詞-一般 | obj | xxxx | true | false |
26
+ # | を | を | ADP | 助詞-格助詞 | case | x | true | true |
27
+ # | 14,800 | 14,800 | NUM | 名詞-数詞 | fixed | dd,ddd | false | false |
28
+ # | 円 | 円 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
29
+ # | で | で | ADP | 助詞-格助詞 | case | x | true | true |
30
+ # | 発売 | 発売 | VERB | 名詞-普通名詞-サ変可能 | ROOT | xx | true | false |
31
+ # | し | する | AUX | 動詞-非自立可能 | aux | x | true | true |
32
+ # | た | た | AUX | 助動詞 | aux | x | true | true |
33
+ # | 。 | 。 | PUNCT | 補助記号-句点 | punct | 。 | false | false |
34
+ # +------------+------------+-------+--------------------------+--------+--------+----------+---------+
@@ -0,0 +1,16 @@
1
+ require "ruby-spacy"
2
+
3
+ nlp = Spacy::Language.new("ja_core_news_sm")
4
+
5
+ doc = nlp.read("これは文です。今私は「これは文です」と言いました。")
6
+
7
+
8
+ puts "doc has annotation SENT_START: " + doc.has_annotation("SENT_START").to_s
9
+
10
+ doc.sents.each do |sent|
11
+ puts sent.text
12
+ end
13
+
14
+ # doc has annotation SENT_START: true
15
+ # これは文です。
16
+ # 今私は「これは文です」と言いました。
@@ -0,0 +1,12 @@
1
+ require "ruby-spacy"
2
+
3
+ nlp = Spacy::Language.new("ja_core_news_lg")
4
+ ja_doc1 = nlp.read("今日は雨ばっかり降って、嫌な天気ですね。")
5
+ puts "doc1: #{ja_doc1.text}"
6
+ ja_doc2 = nlp.read("あいにくの悪天候で残念です。")
7
+ puts "doc2: #{ja_doc2.text}"
8
+ puts "Similarity: #{ja_doc1.similarity(ja_doc2)}"
9
+
10
+ # doc1: 今日は雨ばっかり降って、嫌な天気ですね。
11
+ # doc2: あいにくの悪天候で残念です。
12
+ # Similarity: 0.8684192637149641
@@ -0,0 +1,38 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("ja_core_news_sm")
5
+
6
+ doc = nlp.read("アップルはイギリスの新興企業を10億ドルで買収しようとしている。")
7
+
8
+ headings = ["text"]
9
+ rows = []
10
+
11
+ doc.each do |token|
12
+ rows << [token.text]
13
+ end
14
+
15
+ table = Terminal::Table.new rows: rows, headings: headings
16
+ puts table
17
+
18
+ # +----------+
19
+ # | text |
20
+ # +----------+
21
+ # | アップル |
22
+ # | は |
23
+ # | イギリス |
24
+ # | の |
25
+ # | 新興 |
26
+ # | 企業 |
27
+ # | を |
28
+ # | 10億 |
29
+ # | ドル |
30
+ # | で |
31
+ # | 買収 |
32
+ # | しよう |
33
+ # | と |
34
+ # | し |
35
+ # | て |
36
+ # | いる |
37
+ # | 。 |
38
+ # +----------+
@@ -0,0 +1,13 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("ja_core_news_sm")
5
+
6
+ sentence = "自動運転車は保険責任を製造者に転嫁する。"
7
+ doc = nlp.read(sentence)
8
+
9
+ dep_svg = doc.displacy('dep', false)
10
+
11
+ File.open(File.join(File.dirname(__FILE__), "outputs/test_dep.svg"), "w") do |file|
12
+ file.write(dep_svg)
13
+ end
@@ -0,0 +1,14 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("ja_core_news_lg")
5
+
6
+ sentence ="セバスチアン・スランが2007年にグーグルで自動運転車に取り組み始めたとき、社外の人間で彼のことを真剣に捉えている者はほとんどいなかった。"
7
+
8
+ doc = nlp.read(sentence)
9
+
10
+ ent_html = doc.displacy('ent')
11
+
12
+ File.open(File.join(File.dirname(__FILE__), "outputs/test_ent.html"), "w") do |file|
13
+ file.write(ent_html)
14
+ end
@@ -0,0 +1,41 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+
6
+ sentence = "Credit and mortgage account holders must submit their requests"
7
+ doc = nlp.read(sentence)
8
+
9
+ headings = ["text", "dep", "n_lefts", "n_rights", "ancestors"]
10
+ rows = []
11
+
12
+ root = doc.tokens.select do |t|
13
+ # The root token is its own head; compare the token and its head by their indices (t.i)
14
+ t.i == t.head.i
15
+ end.first
16
+
17
+ puts "The sentence: " + sentence
18
+
19
+ subject = Spacy::Token.new(root.lefts[0])
20
+
21
+ puts "The root of the sentence is: " + root.text
22
+ puts "The subject of the sentence is: " + subject.text
23
+
24
+ subject.subtree.each do |descendant|
25
+ # "ancestors" is a Python generator, so convert it to a Ruby array first
26
+ ancestors = Spacy::generator_to_array(descendant.ancestors)
27
+ rows << [descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, ancestors]
28
+ end
29
+
30
+ table = Terminal::Table.new rows: rows, headings: headings
31
+ print table
32
+
33
+ # +----------+----------+---------+----------+------------------------------------+
34
+ # | text | dep | n_lefts | n_rights | ancestors |
35
+ # +----------+----------+---------+----------+------------------------------------+
36
+ # | Credit | nmod | 0 | 2 | [holders, submit] |
37
+ # | and | cc | 0 | 0 | [Credit, holders, submit] |
38
+ # | mortgage | compound | 0 | 0 | [account, Credit, holders, submit] |
39
+ # | account | conj | 1 | 0 | [Credit, holders, submit] |
40
+ # | holders | nsubj | 1 | 0 | [submit] |
41
+ # +----------+----------+---------+----------+------------------------------------+
@@ -0,0 +1,29 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+
6
+ sentence = "San Francisco considers banning sidewalk delivery robots"
7
+ doc = nlp.read(sentence)
8
+
9
+ headings = ["text", "ent_iob", "ent_iob_", "ent_type_"]
10
+ rows = []
11
+
12
+ doc.each do |ent|
13
+ rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type_]
14
+ end
15
+
16
+ table = Terminal::Table.new rows: rows, headings: headings
17
+ print table
18
+
19
+ # +-----------+---------+----------+-----------+
20
+ # | text | ent_iob | ent_iob_ | ent_type_ |
21
+ # +-----------+---------+----------+-----------+
22
+ # | San | 3 | B | GPE |
23
+ # | Francisco | 1 | I | GPE |
24
+ # | considers | 2 | O | |
25
+ # | banning | 2 | O | |
26
+ # | sidewalk | 2 | O | |
27
+ # | delivery | 2 | O | |
28
+ # | robots | 2 | O | |
29
+ # +-----------+---------+----------+-----------+
@@ -0,0 +1,20 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+
6
+ doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")
7
+
8
+
9
+ results = []
10
+
11
+ doc.each do |token|
12
+ if token.dep_ == "nsubj" && token.head.pos_ == "VERB"
13
+ results << token.head
14
+ end
15
+ end
16
+
17
+ puts results.to_s
18
+
19
+ # [shift]
20
+
@@ -0,0 +1,36 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+
6
+ nlp.add_pipe("merge_entities")
7
+ nlp.add_pipe("merge_noun_chunks")
8
+
9
+ sentence = "Credit and mortgage account holders must submit their requests"
10
+ doc = nlp.read(sentence)
11
+
12
+ texts = [
13
+ "Net income was $9.4 million compared to the prior year of $2.7 million.",
14
+ "Revenue exceeded twelve billion dollars, with a loss of $1b.",
15
+ ]
16
+
17
+ texts.each do |text|
18
+ doc = nlp.read(text)
19
+ doc.each do |token|
20
+ if token.ent_type_ == "MONEY"
21
+ if ["attr", "dobj"].index token.dep_
22
+ subj = Spacy.generator_to_array(token.head.lefts).select{|t| t.dep_ == "nsubj"}
23
+ if !subj.empty?
24
+ puts(subj[0].text + " --> " + token.text)
25
+ end
26
+ elsif token.dep_ == "pobj" and token.head.dep_ == "prep"
27
+ puts token.head.head.text + " --> " + token.text
28
+ end
29
+ end
30
+ end
31
+ end
32
+
33
+ # Net income --> $9.4 million
34
+ # the prior year --> $2.7 million
35
+ # Revenue --> twelve billion dollars
36
+ # a loss --> 1b
@@ -0,0 +1,24 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+
6
+ doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")
7
+
8
+
9
+ results = []
10
+
11
+ doc.each do |token|
12
+ if token.pos_ == "VERB"
13
+ token.children.each do |child|
14
+ if child.dep_ == "nsubj"
15
+ results << child.head
16
+ end
17
+ end
18
+ end
19
+ end
20
+
21
+ puts results.to_s
22
+
23
+ # [shift]
24
+
@@ -0,0 +1,20 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+
6
+ doc = nlp.read("bright red apples on the tree")
7
+
8
+ puts "Text: " + doc.text
9
+
10
+ puts "Words to the left of 'apple': " + Spacy.generator_to_array(doc[2].lefts).to_s
11
+ puts "Words to the right of 'apple': " + Spacy.generator_to_array(doc[2].rights).to_s
12
+
13
+ puts "Num of the words to the left of 'apple': " + doc[2].n_lefts.to_s
14
+ puts "Num of the words to the right of 'apple': " + doc[2].n_rights.to_s
15
+
16
+ # Text: bright red apples on the tree
17
+ # Words to the left of 'apple': [bright, red]
18
+ # Words to the right of 'apple': [on]
19
+ # Num of the words to the left of 'apple': 2
20
+ # Num of the words to the right of 'apple': 1
@@ -0,0 +1,31 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+
6
+ lemmatizer = nlp.get_pipe("lemmatizer")
7
+ puts "Lemmatizer mode: " + lemmatizer.mode
8
+
9
+ doc = nlp.read("I was reading the paper.")
10
+
11
+ headings = ["lemma"]
12
+ rows = []
13
+
14
+ doc.each do |token|
15
+ rows << [token.lemma_]
16
+ end
17
+
18
+ table = Terminal::Table.new rows: rows, headings: headings
19
+ puts table
20
+
21
+ # Lemmatizer mode: rule
22
+ # +-------+
23
+ # | lemma |
24
+ # +-------+
25
+ # | I |
26
+ # | be |
27
+ # | read |
28
+ # | the |
29
+ # | paper |
30
+ # | . |
31
+ # +-------+
@@ -0,0 +1,17 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+
6
+ puts "Pipeline: " + nlp.pipe_names.to_s
7
+
8
+ doc = nlp.read("I was reading the paper.")
9
+
10
+ token = doc[0]
11
+
12
+ puts "Morph features of the first word: " + token.morph.to_s
13
+ puts "PronType of the word: " + token.morph.get("PronType").to_s
14
+
15
+ # Pipeline: ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
16
+ # Morph features of the first word: Case=Nom|Number=Sing|Person=1|PronType=Prs
17
+ # PronType of the word: ['Prs']
@@ -0,0 +1,25 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+
6
+ sentence = "Apple is looking at buying U.K. startup for $1 billion"
7
+ doc = nlp.read(sentence)
8
+
9
+ headings = ["text", "start", "end", "label"]
10
+ rows = []
11
+
12
+ doc.ents.each do |ent|
13
+ rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
14
+ end
15
+
16
+ table = Terminal::Table.new rows: rows, headings: headings
17
+ puts table
18
+
19
+ # +------------+-------+-----+-------+
20
+ # | text | start | end | label |
21
+ # +------------+-------+-----+-------+
22
+ # | Apple | 0 | 5 | ORG |
23
+ # | U.K. | 27 | 31 | GPE |
24
+ # | $1 billion | 44 | 54 | MONEY |
25
+ # +------------+-------+-----+-------+
@@ -0,0 +1,32 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+
6
+ lemmatizer = nlp.get_pipe("lemmatizer")
7
+ puts "Lemmatizer mode: " + lemmatizer.mode
8
+
9
+ doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")
10
+
11
+ headings = ["text", "dep", "head text", "head pos", "children"]
12
+ rows = []
13
+
14
+ doc.each do |token|
15
+ rows << [token.text, token.dep_, token.head.text, token.head.pos_, token.children.to_s]
16
+ end
17
+
18
+ table = Terminal::Table.new rows: rows, headings: headings
19
+ puts table
20
+
21
+ # Lemmatizer mode: rule
22
+ # +---------------+----------+-----------+----------+---------------------------+
23
+ # | text | dep | head text | head pos | children |
24
+ # +---------------+----------+-----------+----------+---------------------------+
25
+ # | Autonomous | amod | cars | NOUN | [] |
26
+ # | cars | nsubj | shift | VERB | [Autonomous] |
27
+ # | shift | ROOT | shift | VERB | [cars, liability, toward] |
28
+ # | insurance | compound | liability | NOUN | [] |
29
+ # | liability | dobj | shift | VERB | [insurance] |
30
+ # | toward | prep | shift | VERB | [manufacturers] |
31
+ # | manufacturers | pobj | toward | ADP | [] |
32
+ # +---------------+----------+-----------+----------+---------------------------+