RubyGems - ruby-spacy - Versions diffs - 0.1.2 → 0.1.3 - Mend

ruby-spacy 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +14 -0
data/Gemfile.lock +1 -1
data/examples/get_started/pos_tags_and_dependencies.rb +1 -1
data/examples/get_started/similarity.rb +2 -2
data/examples/japanese/visualizing_dependencies.rb +2 -2
data/examples/japanese/visualizing_named_entities.rb +1 -1
data/examples/linguistic_features/iterating_lefts_and_rights.rb +1 -1
data/examples/linguistic_features/similarity.rb +2 -2
data/examples/linguistic_features/similarity_between_spans.rb +2 -2
data/lib/ruby-spacy.rb +331 -325
data/lib/ruby-spacy/version.rb +1 -1
metadata +4 -4
data/examples/linguistic_features/special_case_tokenization_rules.rb +0 -19

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 9add9d3b065bbf5064652cb115f824221d929a20478d182782df5db564cc8f45
-  data.tar.gz: f07d502f79883a452e7f250f0fe784425511a0de4f8a43db0b29ca03801bd755
+  metadata.gz: f2571b8bd9a0e170c5462d73b57ab75d535936f591c6b7954d0ca6b2a9d74aeb
+  data.tar.gz: 55bffce60937e8ae1f4135a076185b64f2e989c9431e67914d926ee60b2cc537
 SHA512:
-  metadata.gz: 373c795a148034f4191cfaf130a23f464dc2b43927bf6aa3165999c78797365ce2f976021ea8b9ab1dd083736e5f9a1da51a5ccf0156d00ec39dac9fd19bde7c
-  data.tar.gz: e370e503c23d15a0a44be84bf578775b0a4acc5557468c7fc9468cde44e0e084018be8dc17c3e7c21d9efdaf229611ca234614fcd2e811272051c7c2922b408d
+  metadata.gz: b3de6a6179eef74f224db186075fe084c93fb2fc802c4831b4b23c069594ac8c6aada546336d16065b9eeef116ba3a686afc6c441e68a66ba0fbddd7ffa321a6
+  data.tar.gz: 05b174052487979bf1c81a54469264068dac76d17d17b6f870b7666472625ca596f022abf6f6d734f5b02d4d623b11351c57e1d1d90deb192aaaa36d59e7255c

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,14 @@
+# Change Log
+## 0.1.3 - 2021-06-26
+- Code cleanup
+## 0.1.2 - 2021-06-26
+### Added
+- `Spacy::Token#morpheme` method
+## 0.1.1 - 2021-06-26
+- Project description fixed
+## 0.1.0 - 2021-06-26
+- Initial release

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    ruby-spacy (0.1.2)
+    ruby-spacy (0.1.3)
       numpy (~> 0.4.0)
       pycall (~> 1.4.0)
       terminal-table (~> 3.0.1)

data/examples/get_started/pos_tags_and_dependencies.rb CHANGED Viewed

@@ -2,7 +2,7 @@ require "ruby-spacy"
 require "terminal-table"
 nlp = Spacy::Language.new("en_core_web_sm")
-doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
+doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion.")
 headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []

data/examples/get_started/similarity.rb CHANGED Viewed

@@ -4,8 +4,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
 doc1 = nlp.read("I like salty fries and hamburgers.")
 doc2 = nlp.read("Fast food tastes very good.")
-puts "Doc 1: " + doc1
-puts "Doc 2: " + doc2
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
 puts "Similarity: #{doc1.similarity(doc2)}"
 # Doc 1: I like salty fries and hamburgers.

data/examples/japanese/visualizing_dependencies.rb CHANGED Viewed

@@ -6,8 +6,8 @@ nlp = Spacy::Language.new("ja_core_news_sm")
 sentence = "自動運転車は保険責任を製造者に転嫁する。"
 doc = nlp.read(sentence)
-dep_svg = doc.displacy('dep', false)
+dep_svg = doc.displacy(style: 'dep', compact: false)
-File.open(File.join(File.dirname(__FILE__), "outputs/test_dep.svg"), "w") do |file|
+File.open(File.join(File.dirname(__FILE__), "test_dep.svg"), "w") do |file|
   file.write(dep_svg)
 end

data/examples/japanese/visualizing_named_entities.rb CHANGED Viewed

@@ -7,7 +7,7 @@ sentence ="セバスチアン・スランが2007年にグーグルで自動運
 doc = nlp.read(sentence)
-ent_html = doc.displacy('ent')
+ent_html = doc.displacy(style: 'ent')
 File.open(File.join(File.dirname(__FILE__), "outputs/test_ent.html"), "w") do |file|
   file.write(ent_html)

data/examples/linguistic_features/iterating_lefts_and_rights.rb CHANGED Viewed

@@ -5,7 +5,7 @@ nlp = Spacy::Language.new("en_core_web_sm")
 doc = nlp.read("bright red apples on the tree")
-puts "Text: " + doc
+puts "Text: " + doc.text
 puts "Words to the left of 'apple': " + Spacy.generator_to_array(doc[2].lefts).to_s
 puts "Words to the right of 'apple': " + Spacy.generator_to_array(doc[2].rights).to_s

data/examples/linguistic_features/similarity.rb CHANGED Viewed

@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
 doc1 = nlp.read("I like salty fries and hamburgers.")
 doc2 = nlp.read("Fast food tastes very good.")
-puts "Doc 1: " + doc1
-puts "Doc 2: " + doc2
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
 puts "Similarity: #{doc1.similarity(doc2)}"
 # Doc 1: I like salty fries and hamburgers.

data/examples/linguistic_features/similarity_between_spans.rb CHANGED Viewed

@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
 doc1 = nlp.read("I like salty fries and hamburgers.")
 doc2 = nlp.read("Fast food tastes very good.")
-puts "Doc 1: " + doc1
-puts "Doc 2: " + doc2
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
 puts "Similarity: #{doc1.similarity(doc2)}"
 span1 = doc1.span(2, 2) # salty fries

data/lib/ruby-spacy.rb CHANGED Viewed

@@ -3,12 +3,34 @@
 require_relative "ruby-spacy/version"
 require 'enumerator'
 require 'strscan'
-require 'pycall/import'
 require 'numpy'
+require 'pycall/import'
 include PyCall::Import
 # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
 module Spacy
+  extend PyCall::Import
+  spacy = PyCall.import_module('spacy')
+  # Python `Language` class
+  PyLanguage = spacy.language.Language
+  # Python `Doc` class object
+  PyDoc = spacy.tokens.Doc
+  # Python `Span` class object
+  PySpan = spacy.tokens.Span
+  # Python `Token` class object
+  PyToken = spacy.tokens.Token
+  # Python `Matcher` class object
+  PyMatcher = spacy.matcher.Matcher
+  # Python `displacy` object
+  PyDisplacy = spacy.displacy
   # A utility module method to convert Python's generator object to a Ruby array,
   # mainly used on the items inside the array returned from dependency-related methods
   # such as {Span#rights}, {Span#lefts} and {Span#subtree}.
@@ -16,12 +38,303 @@ module Spacy
     PyCall::List.(py_generator)
   end
+  # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
+  class Doc
+    # @return [Object] a Python `Language` instance accessible via `PyCall`
+    attr_reader :py_nlp
+    # @return [Object] a Python `Doc` instance accessible via `PyCall`
+    attr_reader :py_doc
+    # @return [String] a text string of the document
+    attr_reader :text
+    include Enumerable
+    alias_method :length, :count
+    alias_method :len, :count
+    alias_method :size, :count
+    # It is recommended to use {Language#read} method to create a doc. If you need to
+    # create one using {Doc#initialize}, there are two method signatures:
+    # `Spacy::Doc.new(nlp, py_doc: Object)` and `Spacy::Doc.new(nlp, text: String)`.
+    # @param nlp [Language] an instance of {Language} class
+    # @param py_doc [Object] an instance of Python `Doc` class
+    # @param text [String] the text string to be analyzed
+    def initialize(nlp, py_doc: nil, text: nil)
+      @py_nlp = nlp
+      if py_doc
+        @py_doc = py_doc
+      else
+        @py_doc = nlp.(text)
+      end
+      @text = @py_doc.text
+    end
+    # Retokenizes the text merging a span into a single token.
+    # @param start_index [Integer] the start position of the span to be retokenized in the document
+    # @param end_index [Integer] the end position of the span to be retokenized in the document
+    # @param attributes [Hash] attributes to set on the merged token
+    def retokenize(start_index, end_index, attributes = {})
+      PyCall.with(@py_doc.retokenize()) do |retokenizer|
+        retokenizer.merge(@py_doc[start_index .. end_index], attrs: attributes)
+      end
+    end
+    # Retokenizes the text splitting the specified token.
+    # @param pos_in_doc [Integer] the position in the document of the token to be split
+    # @param split_array [Array<String>] text strings of the split results
+    # @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
+    # @param attributes [Hash] the attributes of the split elements
+    def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
+      PyCall.with(@py_doc.retokenize()) do |retokenizer|
+        heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
+        retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
+      end
+    end
+    # String representation of the document.
+    # @return [String]
+    def to_s
+      @text
+    end
+    # Returns an array of tokens contained in the doc.
+    # @return [Array<Token>]
+    def tokens
+      results = []
+      PyCall::List.(@py_doc).each do |py_token|
+        results << Token.new(py_token)
+      end
+      results
+    end
+    # Iterates over the elements in the doc yielding a token instance each time.
+    def each
+      PyCall::List.(@py_doc).each do |py_token|
+        yield Token.new(py_token)
+      end
+    end
+    # Returns a span of the specified range within the doc.
+    # The method should be used in either of two ways: `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
+    # @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
+    # @param optional_size [Integer] an integer representing the size of the span
+    # @return [Span]
+    def span(range_or_start, optional_size = nil)
+      if optional_size
+        start_index = range_or_start
+        temp = tokens[start_index ... start_index + optional_size]
+      else
+        start_index = range_or_start.first
+        range = range_or_start
+        temp = tokens[range]
+      end
+      end_index = start_index + temp.size - 1
+      Span.new(self, start_index: start_index, end_index: end_index)
+    end
+    # Returns an array of spans representing noun chunks.
+    # @return [Array<Span>]
+    def noun_chunks
+      chunk_array = []
+      py_chunks = PyCall::List.(@py_doc.noun_chunks)
+      py_chunks.each do |py_chunk|
+        chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
+      end
+      chunk_array
+    end
+    # Returns an array of spans each representing a sentence.
+    # @return [Array<Span>]
+    def sents
+      sentence_array = []
+      py_sentences = PyCall::List.(@py_doc.sents)
+      py_sentences.each do |py_sent|
+        sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
+      end
+      sentence_array
+    end
+    # Returns an array of spans each representing a named entity.
+    # @return [Array<Span>]
+    def ents
+      # so that ents can be "each"-ed in Ruby
+      ent_array = []
+      PyCall::List.(@py_doc.ents).each do |ent|
+        ent_array << ent
+      end
+      ent_array
+    end
+    # Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
+    # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
+    def [](range)
+      if range.is_a?(Range)
+        py_span = @py_doc[range]
+        return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
+      else
+        return Token.new(@py_doc[range])
+      end
+    end
+    # Returns a semantic similarity estimate.
+    # @param other [Doc] the other doc to which a similarity estimation is made
+    # @return [Float]
+    def similarity(other)
+      py_doc.similarity(other.py_doc)
+    end
+    # Visualize the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
+    # @param style [String] either `dep` or `ent`
+    # @param compact [Boolean] only relevant to the `dep` style
+    # @return [String] in the case of `dep`, the output text will be an SVG, whereas in the `ent` style, the output text will be an HTML.
+    def displacy(style: "dep", compact: false)
+      PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
+    end
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_doc.send(name, *args)
+    end
+  end
+  # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
+  class Language
+    # @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
+    attr_reader :spacy_nlp_id
+    # @return [Object] a Python `Language` instance accessible via `PyCall`
+    attr_reader :py_nlp
+    # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
+    # @param model [String] A language model installed in the system
+    def initialize(model = "en_core_web_sm")
+      @spacy_nlp_id = "nlp_#{model.object_id}"
+      PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
+      @py_nlp = PyCall.eval(@spacy_nlp_id)
+    end
+    # Reads and analyzes the given text.
+    # @param text [String] a text to be read and analyzed
+    def read(text)
+      Doc.new(py_nlp, text: text)
+    end
+    # Generates a matcher for the current language model.
+    # @return [Matcher]
+    def matcher
+      Matcher.new(@py_nlp)
+    end
+    # A utility method to lookup a vocabulary item of the given id.
+    # @param id [Integer] a vocabulary id
+    # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
+    def vocab_string_lookup(id)
+      PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+    end
+    # A utility method to list pipeline components.
+    # @return [Array<String>] An array of text strings representing pipeline components
+    def pipe_names
+      pipe_array = []
+      PyCall::List.(@py_nlp.pipe_names).each do |pipe|
+        pipe_array << pipe
+      end
+      pipe_array
+    end
+    # A utility method to get a Python `Lexeme` object.
+    # @param text [String] A text string representing a lexeme
+    # @return [Object] Python `Lexeme` object (https://spacy.io/api/lexeme)
+    def get_lexeme(text)
+      text = text.gsub("'", "\'")
+      @py_nlp.vocab[text]
+    end
+    # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
+    # @param vector [Object] A vector representation of a word (whether existing or non-existing)
+    # @return [Array<Hash{:key => Integer, :text => String, :best_row => Array<Float>, :score => Float}>] An array of hash objects, each containing the `key`, `text`, `best_row` and similarity `score` of a lexeme
+    def most_similar(vector, n)
+      vec_array = Numpy.asarray([vector])
+      py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
+      key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist}]")
+      keys = key_texts.map{|kt| kt[0]}
+      texts = key_texts.map{|kt| kt[1]}
+      best_rows = PyCall::List.(py_result[1])[0]
+      scores = PyCall::List.(py_result[2])[0]
+      results = []
+      n.times do |i|
+        results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
+      end
+      results
+    end
+    # Utility function to batch process many texts
+    # @param texts [Array<String>] the texts to be processed
+    # @param disable [Array<String>]
+    # @param batch_size [Integer]
+    # @return [Array<Doc>]
+    def pipe(texts, disable: [], batch_size: 50)
+      docs = []
+      PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
+        docs << Doc.new(@py_nlp, py_doc: py_doc)
+      end
+      docs
+    end
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_nlp.send(name, *args)
+    end
+  end
+  # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
+  class Matcher
+    # @return [Object] a Python `Matcher` instance accessible via `PyCall`
+    attr_reader :py_matcher
+    # Creates a {Matcher} instance
+    # @param nlp [Language] an instance of {Language} class
+    def initialize(nlp)
+      @py_matcher = PyMatcher.(nlp.vocab)
+    end
+    # Adds a label string and a text pattern.
+    # @param text [String] a label string given to the pattern
+    # @param pattern [Array<Array<Hash>>] sequences of text patterns that are alternative to each other
+    def add(text, pattern)
+      @py_matcher.add(text, pattern)
+    end
+    # Execute the match.
+    # @param doc [Doc] a {Doc} instance
+    # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
+    def match(doc)
+      str_results = @py_matcher.(doc.py_doc).to_s
+      s = StringScanner.new(str_results[1..-2])
+      results = []
+      while s.scan_until(/(\d+), (\d+), (\d+)/)
+        next unless s.matched
+        triple = s.matched.split(", ")
+        match_id = triple[0].to_i
+        start_index = triple[1].to_i
+        end_index = triple[2].to_i - 1
+        results << {match_id: match_id, start_index: start_index, end_index: end_index}
+      end
+      results
+    end
+  end
   # See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
   class Span
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_span_id
     # @return [Object] a Python `Span` instance accessible via `PyCall`
     attr_reader :py_span
@@ -35,21 +348,18 @@ module Spacy
     alias_method :size, :count
     # It is recommended to use {Doc#span} method to create a span. If you need to
-    # create one using {Span#initialize}, either of the two method signatures should be used: `Spacy.new(doc, py_span)` and `Spacy.new(doc, start_index, end_index, options)`.
+    # create one using {Span#initialize}, there are two method signatures:
+    # `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
     # @param doc [Doc] the document to which this span belongs to
     # @param start_index [Integer] the index of the item starting the span inside a doc
     # @param end_index [Integer] the index of the item ending the span inside a doc
     # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
     def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
       @doc = doc
-      @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
       if py_span
         @py_span = py_span
       else
-        options = PyCall::Dict.(options)
-        PyCall.exec("#{@spacy_span_id}_opts = #{options}")
-        PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
-        @py_span = PyCall.eval(@spacy_span_id)
+        @py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
       end
     end
@@ -63,7 +373,7 @@ module Spacy
       results
     end
-    # Iterates over the elements in the span yielding a token instance.
+    # Iterates over the elements in the span yielding a token instance each time.
     def each
       PyCall::List.(@py_span).each do |py_token|
         yield Token.new(py_token)
@@ -97,7 +407,6 @@ module Spacy
     def ents
       ent_array = []
       PyCall::List.(@py_span.ents).each do |py_span|
-        # ent_array << ent
         ent_array << Spacy::Span.new(@doc, py_span: py_span)
       end
       ent_array
@@ -106,11 +415,11 @@ module Spacy
     # Returns a span that represents the sentence that the given span is part of.
     # @return [Span]
     def sent
-      py_span =@py_span.sent
+      py_span = @py_span.sent
       return Spacy::Span.new(@doc, py_span: py_span)
     end
-    # Returns a span if a range object is given, or a token if an integer representing the position of the doc is given.
+    # Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
     # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
     def [](range)
       if range.is_a?(Range)
@@ -125,16 +434,16 @@ module Spacy
     # @param other [Span] the other span to which a similarity estimation is conducted
     # @return [Float]
     def similarity(other)
-      PyCall.eval("#{@spacy_span_id}.similarity(#{other.spacy_span_id})")
+      py_span.similarity(other.py_span)
     end
-    # Creates a document instance
+    # Creates a document instance from the span
     # @return [Doc]
     def as_doc
-      Spacy::Doc.new(@doc.spacy_nlp_id, self.text)
+      Spacy::Doc.new(@doc.py_nlp, text: self.text)
     end
-    # Returns Tokens conjugated to the root of the span.
+    # Returns tokens conjoined to the root of the span.
     # @return [Array<Token>] an array of tokens
     def conjuncts
       conjunct_array = []
@@ -144,7 +453,7 @@ module Spacy
       conjunct_array
     end
-    # Returns Tokens that are to the left of the span, whose heads are within the span.
+    # Returns tokens that are to the left of the span, whose heads are within the span.
     # @return [Array<Token>] an array of tokens
     def lefts
       left_array = []
@@ -189,7 +498,8 @@ module Spacy
     # @return [String] a string representing the token
     attr_reader :text
-    # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens. There is no way to generate a token from scratch but relying on a pre-exising Python {Token} object.
+    # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
+    # There is no way to generate a token from scratch; one can only be created from a pre-existing Python {Token} object.
     # @param py_token [Object] Python `Token` object
     def initialize(py_token)
       @py_token = py_token
@@ -253,7 +563,7 @@ module Spacy
     end
     # Returns a hash or string of morphological information
-    # @param dict [Boolean] if true, a hash will be returned instead of a string
+    # @param hash [Boolean] if true, a hash will be returned instead of a string
     # @return [Hash, String]
     def morphology(hash = true)
       if @py_token.has_morph
@@ -278,310 +588,6 @@ module Spacy
     end
   end
-  # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
-  class Doc
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_nlp_id
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_doc_id
-    # @return [Object] a Python `Doc` instance accessible via `PyCall`
-    attr_reader :py_doc
-    # @return [String] a text string of the document
-    attr_reader :text
-    include Enumerable
-    alias_method :length, :count
-    alias_method :len, :count
-    alias_method :size, :count
-    # Creates a new instance of {Doc}.
-    # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
-    # @param text [String] The text string to be analyzed
-    def initialize(nlp_id, text)
-      @text = text
-      @spacy_nlp_id = nlp_id
-      @spacy_doc_id = "doc_#{text.object_id}"
-      quoted = text.gsub('"', '\"')
-      PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
-      PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
-      @py_doc = PyCall.eval(@spacy_doc_id)
-    end
-    # Retokenizes the text merging a span into a single token.
-    # @param start_index [Integer] The start position of the span to be retokenized in the document
-    # @param end_index [Integer] The end position of the span to be retokenized in the document
-    # @param attributes [Hash] Attributes to set on the merged token
-    def retokenize(start_index, end_index, attributes = {})
-      py_attrs = PyCall::Dict.(attributes)
-      PyCall.exec(<<PY)
-with #{@spacy_doc_id}.retokenize() as retokenizer:
-    retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
-PY
-      @py_doc = PyCall.eval(@spacy_doc_id)
-    end
-    # Retokenizes the text splitting the specified token.
-    # @param pos_in_doc [Integer] The position of the span to be retokenized in the document
-    # @param split_array [Array<String>] text strings of the split results
-    # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
-    # @param attributes [Hash] The attributes of the split elements
-    def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
-      py_attrs = PyCall::Dict.(attributes)
-      py_split_array = PyCall::List.(split_array)
-      PyCall.exec(<<PY)
-with #{@spacy_doc_id}.retokenize() as retokenizer:
-    heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
-    attrs = #{py_attrs}
-    split_array = #{py_split_array}
-    retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
-PY
-      @py_doc = PyCall.eval(@spacy_doc_id)
-    end
-    # String representation of the token.
-    # @return [String]
-    def to_s
-      @text
-    end
-    # Returns an array of tokens contained in the doc.
-    # @return [Array<Token>]
-    def tokens
-      results = []
-      PyCall::List.(@py_doc).each do |py_token|
-        results << Token.new(py_token)
-      end
-      results
-    end
-    # Iterates over the elements in the doc yielding a token instance.
-    def each
-      PyCall::List.(@py_doc).each do |py_token|
-        yield Token.new(py_token)
-      end
-    end
-    # Returns a span of the specified range within the doc.
-    # The method should be used either of the two ways: `Doc#span(range)` or `Doc#span{start_pos, size_of_span}`.
-    # @param range_or_start [Range, Integer] A range object, or, alternatively, an integer that represents the start position of the span
-    # @param optional_size [Integer] An integer representing the size of the span
-    # @return [Span]
-    def span(range_or_start, optional_size = nil)
-      if optional_size
-        start_index = range_or_start
-        temp = tokens[start_index ... start_index + optional_size]
-      else
-        start_index = range_or_start.first
-        range = range_or_start
-        temp = tokens[range]
-      end
-      end_index = start_index + temp.size - 1
-      Span.new(self, start_index: start_index, end_index: end_index)
-    end
-    # Returns an array of spans representing noun chunks.
-    # @return [Array<Span>]
-    def noun_chunks
-      chunk_array = []
-      py_chunks = PyCall::List.(@py_doc.noun_chunks)
-      py_chunks.each do |py_chunk|
-        chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
-      end
-      chunk_array
-    end
-    # Returns an array of spans representing sentences.
-    # @return [Array<Span>]
-    def sents
-      sentence_array = []
-      py_sentences = PyCall::List.(@py_doc.sents)
-      py_sentences.each do |py_sent|
-        sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
-      end
-      sentence_array
-    end
-    # Returns an array of spans representing named entities.
-    # @return [Array<Span>]
-    def ents
-      # so that ents canbe "each"-ed in Ruby
-      ent_array = []
-      PyCall::List.(@py_doc.ents).each do |ent|
-        ent_array << ent
-      end
-      ent_array
-    end
-    # Returns a span if given a range object; returns a token if given an integer representing a position in the doc.
-    # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
-    def [](range)
-      if range.is_a?(Range)
-        py_span = @py_doc[range]
-        return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
-      else
-        return Token.new(@py_doc[range])
-      end
-    end
-    # Returns a semantic similarity estimate.
-    # @param other [Doc] the other doc to which a similarity estimation is made
-    # @return [Float]
-    def similarity(other)
-      PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
-    end
-    # Visualize the document in one of two styles: dep (dependencies) or ent (named entities).
-    # @param style [String] Either `dep` or `ent`
-    # @param compact [Boolean] Only relevant to the `dep' style
-    # @return [String] in the case of `dep`, the output text is an SVG while in the `ent` style, the output text is an HTML.
-    def displacy(style: "dep", compact: false)
-      PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
-    end
-    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
-    def method_missing(name, *args)
-      @py_doc.send(name, *args)
-    end
-  end
-  # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
-  class Matcher
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_matcher_id
-    # @return [Object] a Python `Matcher` instance accessible via `PyCall`
-    attr_reader :py_matcher
-    # Creates a {Matcher} instance
-    # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
-    def initialize(nlp_id)
-      @spacy_matcher_id = "doc_#{nlp_id}_matcher"
-      PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
-      @py_matcher = PyCall.eval(@spacy_matcher_id)
-    end
-    # Adds a label string and a text pattern.
-    # @param text [String] a label string given to the pattern
-    # @param pattern [Array<Array<Hash>>] alternative sequences of text patterns
-    def add(text, pattern)
-      @py_matcher.add(text, pattern)
-    end
-    # Execute the match.
-    # @param doc [Doc] An {Doc} instance
-    # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] The id of the matched pattern, the starting position, and the end position
-    def match(doc)
-      str_results = PyCall.eval("#{@spacy_matcher_id}(#{doc.spacy_doc_id})").to_s
-      s = StringScanner.new(str_results[1..-2])
-      results = []
-      while s.scan_until(/(\d+), (\d+), (\d+)/)
-        next unless s.matched
-        triple = s.matched.split(", ")
-        match_id = triple[0].to_i
-        start_index = triple[1].to_i
-        end_index = triple[2].to_i - 1
-        results << {match_id: match_id, start_index: start_index, end_index: end_index}
-      end
-      results
-    end
-  end
-  # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
-  class Language
-    # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-    attr_reader :spacy_nlp_id
-    # @return [Object] a Python `Language` instance accessible via `PyCall`
-    attr_reader :py_nlp
-    # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
-    # @param model [String] A language model installed in the system
-    def initialize(model = "en_core_web_sm")
-      @spacy_nlp_id = "nlp_#{model.object_id}"
-      PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
-      PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
-      @py_nlp = PyCall.eval(@spacy_nlp_id)
-    end
-    # Reads and analyze the given text.
-    # @param text [String] A text to be read and analyzed
-    def read(text)
-      Doc.new(@spacy_nlp_id, text)
-    end
-    # Generates a matcher for the current language model.
-    # @return [Matcher]
-    def matcher
-      Matcher.new(@spacy_nlp_id)
-    end
-    # A utility method to lookup a vocabulary item of the given id.
-    # @param id [Integer] A vocabulary id
-    # @return [Object] A Python `Lexeme` object
-    def vocab_string_lookup(id)
-      PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
-    end
-    # A utility method to list pipeline components.
-    # @return [Array<String>] An array of text strings representing pipeline components
-    def pipe_names
-      pipe_array = []
-      PyCall::List.(@py_nlp.pipe_names).each do |pipe|
-        pipe_array << pipe
-      end
-      pipe_array
-    end
-    # A utility method to get the tokenizer Python object.
-    # @return [Object] Python `Tokenizer` object
-    def tokenizer
-      return PyCall.eval("#{@spacy_nlp_id}.tokenizer")
-    end
-    # A utility method to get a Python `Lexeme` object.
-    # @param text [String] A text string representing a lexeme
-    # @return [Object] Python `Tokenizer` object
-    def get_lexeme(text)
-      text = text.gsub("'", "\'")
-      py_lexeme = PyCall.eval("#{@spacy_nlp_id}.vocab['#{text}']")
-      return py_lexeme
-    end
-    # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
-    # @param vector [Object] A vector representation of a word (whether existing or non-existing)
-    # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
-    def most_similar(vector, n)
-      vec_array = Numpy.asarray([vector])
-      py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
-      key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
-      keys = key_texts.map{|kt| kt[0]}
-      texts = key_texts.map{|kt| kt[1]}
-      best_rows = PyCall::List.(py_result[1])[0]
-      scores = PyCall::List.(py_result[2])[0]
-      results = []
-      n.times do |i|
-        results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
-      end
-      results
-    end
-    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism....
-    def method_missing(name, *args)
-      @py_nlp.send(name, *args)
-    end
-  end
 end

data/lib/ruby-spacy/version.rb CHANGED Viewed

@@ -2,5 +2,5 @@
 module Spacy
   # The version number of the module
-  VERSION = "0.1.2"
+  VERSION = "0.1.3"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ruby-spacy
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-06-26 00:00:00.000000000 Z
+date: 2021-06-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pycall
@@ -66,6 +66,7 @@ extra_rdoc_files: []
 files:
 - ".gitignore"
 - ".yardopts"
+- CHANGELOG.md
 - Gemfile
 - Gemfile.lock
 - LICENSE.txt
@@ -123,7 +124,6 @@ files:
 - examples/linguistic_features/sentence_segmentation.rb
 - examples/linguistic_features/similarity.rb
 - examples/linguistic_features/similarity_between_spans.rb
-- examples/linguistic_features/special_case_tokenization_rules.rb
 - examples/linguistic_features/tokenization.rb
 - examples/rule_based_matching/creating_spans_from_matches.rb
 - examples/rule_based_matching/matcher.rb
@@ -149,7 +149,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.2.11
+rubygems_version: 3.2.3
 signing_key:
 specification_version: 4
 summary: A wrapper module for using spaCy natural language processing library from

data/examples/linguistic_features/special_case_tokenization_rules.rb DELETED Viewed

@@ -1,19 +0,0 @@
-require "ruby-spacy"
-require "terminal-table"
-nlp = Spacy::Language.new("en_core_web_sm")
-doc = nlp.read("gimme that")
-puts doc.tokens.join(" ")
-# Add special case rule
-special_case = [{ORTH: "gim"}, {ORTH: "me"}]
-tokenizer = nlp.tokenizer
-tokenizer.add_special_case("gimme", special_case)
-# Check new tokenization
-puts nlp.read("gimme that").tokens.join(" ")
-# gimme that
-# gim me that