ruby-spacy 0.1.2 → 0.1.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 9add9d3b065bbf5064652cb115f824221d929a20478d182782df5db564cc8f45
- data.tar.gz: f07d502f79883a452e7f250f0fe784425511a0de4f8a43db0b29ca03801bd755
+ metadata.gz: f2571b8bd9a0e170c5462d73b57ab75d535936f591c6b7954d0ca6b2a9d74aeb
+ data.tar.gz: 55bffce60937e8ae1f4135a076185b64f2e989c9431e67914d926ee60b2cc537
  SHA512:
- metadata.gz: 373c795a148034f4191cfaf130a23f464dc2b43927bf6aa3165999c78797365ce2f976021ea8b9ab1dd083736e5f9a1da51a5ccf0156d00ec39dac9fd19bde7c
- data.tar.gz: e370e503c23d15a0a44be84bf578775b0a4acc5557468c7fc9468cde44e0e084018be8dc17c3e7c21d9efdaf229611ca234614fcd2e811272051c7c2922b408d
+ metadata.gz: b3de6a6179eef74f224db186075fe084c93fb2fc802c4831b4b23c069594ac8c6aada546336d16065b9eeef116ba3a686afc6c441e68a66ba0fbddd7ffa321a6
+ data.tar.gz: 05b174052487979bf1c81a54469264068dac76d17d17b6f870b7666472625ca596f022abf6f6d734f5b02d4d623b11351c57e1d1d90deb192aaaa36d59e7255c
data/CHANGELOG.md ADDED
@@ -0,0 +1,14 @@
+ # Change Log
+
+ ## 0.1.3 - 2021-06-26
+ - Code cleanup
+
+ ## 0.1.2 - 2021-06-26
+ ### Added
+ - `Spacy::Token#morpheme` method
+
+ ## 0.1.1 - 2021-06-26
+ - Project description fixed
+
+ ## 0.1.0 - 2021-06-26
+ - Initial release
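
The 0.1.2 entry above adds morphological analysis to tokens; in the code shipped with this release the feature surfaces as `Token#morphology` (see data/lib/ruby-spacy.rb below). A minimal usage sketch, assuming the `en_core_web_sm` model is installed:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_sm")
    doc = nlp.read("She was reading the paper.")

    doc.each do |token|
      # morphology(false) returns a string such as "Tense=Past|VerbForm=Part";
      # morphology(true), the default, returns a Hash of the same features
      puts "#{token.text}: #{token.morphology(false)}"
    end
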
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
    remote: .
    specs:
-     ruby-spacy (0.1.2)
+     ruby-spacy (0.1.3)
        numpy (~> 0.4.0)
        pycall (~> 1.4.0)
        terminal-table (~> 3.0.1)
@@ -2,7 +2,7 @@ require "ruby-spacy"
  require "terminal-table"

  nlp = Spacy::Language.new("en_core_web_sm")
- doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
+ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion.")

  headings = ["text", "lemma", "pos", "tag", "dep"]
  rows = []
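
The hunk above is from a part-of-speech tagging example that prints a table with terminal-table. A sketch of how the `headings` and `rows` above are presumably filled and rendered (`lemma_`, `pos_`, `tag_`, and `dep_` reach the Python token through `Token#method_missing`):

    require "ruby-spacy"
    require "terminal-table"

    nlp = Spacy::Language.new("en_core_web_sm")
    doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion.")

    headings = ["text", "lemma", "pos", "tag", "dep"]
    rows = doc.map { |t| [t.text, t.lemma_, t.pos_, t.tag_, t.dep_] }

    puts Terminal::Table.new rows: rows, headings: headings
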
@@ -4,8 +4,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
  doc1 = nlp.read("I like salty fries and hamburgers.")
  doc2 = nlp.read("Fast food tastes very good.")

- puts "Doc 1: " + doc1
- puts "Doc 2: " + doc2
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
  puts "Similarity: #{doc1.similarity(doc2)}"

  # Doc 1: I like salty fries and hamburgers.
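
The `.text` fix above is needed because `Spacy::Doc` is not a String: `String#+` performs implicit conversion via `to_str`, which `Doc` does not define, so concatenating a doc raises a `TypeError`. Interpolation, which calls `Doc#to_s`, works either way. A small sketch of the distinction:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_lg")
    doc = nlp.read("Fast food tastes very good.")

    puts "Doc: #{doc}"       # fine: interpolation calls Doc#to_s
    puts "Doc: " + doc.text  # fine: explicit String
    # puts "Doc: " + doc     # TypeError: no implicit conversion into String
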
@@ -6,8 +6,8 @@ nlp = Spacy::Language.new("ja_core_news_sm")
  sentence = "自動運転車は保険責任を製造者に転嫁する。"
  doc = nlp.read(sentence)

- dep_svg = doc.displacy('dep', false)
+ dep_svg = doc.displacy(style: 'dep', compact: false)

- File.open(File.join(File.dirname(__FILE__), "outputs/test_dep.svg"), "w") do |file|
+ File.open(File.join(File.dirname(__FILE__), "test_dep.svg"), "w") do |file|
    file.write(dep_svg)
  end
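
The `displacy` call sites change because the method now takes keyword arguments: the new signature is `Doc#displacy(style:, compact:)` (see data/lib/ruby-spacy.rb below). A sketch of both output styles, assuming the model is installed:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_sm")
    doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion.")

    svg  = doc.displacy(style: "dep", compact: true)  # SVG dependency tree
    html = doc.displacy(style: "ent")                 # HTML with highlighted entities
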
@@ -7,7 +7,7 @@ sentence ="セバスチアン・スランが2007年にグーグルで自動運

  doc = nlp.read(sentence)

- ent_html = doc.displacy('ent')
+ ent_html = doc.displacy(style: 'ent')

  File.open(File.join(File.dirname(__FILE__), "outputs/test_ent.html"), "w") do |file|
    file.write(ent_html)
@@ -5,7 +5,7 @@ nlp = Spacy::Language.new("en_core_web_sm")

  doc = nlp.read("bright red apples on the tree")

- puts "Text: " + doc
+ puts "Text: " + doc.text

  puts "Words to the left of 'apple': " + Spacy.generator_to_array(doc[2].lefts).to_s
  puts "Words to the right of 'apple': " + Spacy.generator_to_array(doc[2].rights).to_s
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
  doc1 = nlp.read("I like salty fries and hamburgers.")
  doc2 = nlp.read("Fast food tastes very good.")

- puts "Doc 1: " + doc1
- puts "Doc 2: " + doc2
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
  puts "Similarity: #{doc1.similarity(doc2)}"

  # Doc 1: I like salty fries and hamburgers.
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
  doc1 = nlp.read("I like salty fries and hamburgers.")
  doc2 = nlp.read("Fast food tastes very good.")

- puts "Doc 1: " + doc1
- puts "Doc 2: " + doc2
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
  puts "Similarity: #{doc1.similarity(doc2)}"

  span1 = doc1.span(2, 2) # salty fries
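
`Doc#span` used above accepts either a start position plus a size, or a Range (see `Doc#span` in the library diff below). Both forms pick out the same tokens:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_lg")
    doc = nlp.read("I like salty fries and hamburgers.")

    span_a = doc.span(2, 2)   # start position 2, two tokens: "salty fries"
    span_b = doc.span(2..3)   # the equivalent Range form
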
data/lib/ruby-spacy.rb CHANGED
@@ -3,12 +3,34 @@
  require_relative "ruby-spacy/version"
  require 'enumerator'
  require 'strscan'
- require 'pycall/import'
  require 'numpy'
+ require 'pycall/import'
  include PyCall::Import

  # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
  module Spacy
+
+   extend PyCall::Import
+   spacy = PyCall.import_module('spacy')
+
+   # Python `Language` class
+   PyLanguage = spacy.language.Language
+
+   # Python `Doc` class object
+   PyDoc = spacy.tokens.Doc
+
+   # Python `Span` class object
+   PySpan = spacy.tokens.Span
+
+   # Python `Token` class object
+   PyToken = spacy.tokens.Token
+
+   # Python `Matcher` class object
+   PyMatcher = spacy.matcher.Matcher
+
+   # Python `displacy` object
+   PyDisplacy = spacy.displacy
+
    # A utility module method to convert Python's generator object to a Ruby array,
    # mainly used on the items inside the array returned from dependency-related methods
    # such as {Span#rights}, {Span#lefts} and {Span#subtree}.
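
The 0.1.3 rewrite replaces string-based `PyCall.exec`/`PyCall.eval` plumbing with direct object references obtained once from `PyCall.import_module` and bound as Ruby constants, as the hunk above shows. A minimal sketch of the pattern, assuming PyCall and spaCy are installed:

    require "pycall/import"

    spacy = PyCall.import_module("spacy")
    # Python attributes are reachable as ordinary method calls on the module object
    puts spacy.tokens.Doc  # presumably prints <class 'spacy.tokens.doc.Doc'>
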
@@ -16,12 +38,303 @@ module Spacy
      PyCall::List.(py_generator)
    end

+   # See also the spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
+   class Doc
+
+     # @return [Object] a Python `Language` instance accessible via `PyCall`
+     attr_reader :py_nlp
+
+     # @return [Object] a Python `Doc` instance accessible via `PyCall`
+     attr_reader :py_doc
+
+     # @return [String] a text string of the document
+     attr_reader :text
+
+     include Enumerable
+
+     alias_method :length, :count
+     alias_method :len, :count
+     alias_method :size, :count
+
+     # It is recommended to use the {Language#read} method to create a doc. If you need to
+     # create one using {Doc#initialize}, there are two method signatures:
+     # `Spacy::Doc.new(nlp, py_doc: Object)` and `Spacy::Doc.new(nlp, text: String)`.
+     # @param nlp [Language] an instance of the {Language} class
+     # @param py_doc [Object] an instance of the Python `Doc` class
+     # @param text [String] the text string to be analyzed
+     def initialize(nlp, py_doc: nil, text: nil)
+       @py_nlp = nlp
+       if py_doc
+         @py_doc = py_doc
+       else
+         @py_doc = nlp.(text)
+       end
+       @text = @py_doc.text
+     end
+
+     # Retokenizes the text, merging a span into a single token.
+     # @param start_index [Integer] the start position of the span to be retokenized in the document
+     # @param end_index [Integer] the end position of the span to be retokenized in the document
+     # @param attributes [Hash] attributes to set on the merged token
+     def retokenize(start_index, end_index, attributes = {})
+       PyCall.with(@py_doc.retokenize()) do |retokenizer|
+         retokenizer.merge(@py_doc[start_index .. end_index], attrs: attributes)
+       end
+     end
+
+     # Retokenizes the text, splitting the specified token.
+     # @param pos_in_doc [Integer] the position of the span to be retokenized in the document
+     # @param split_array [Array<String>] text strings of the split results
+     # @param head_pos_in_split [Integer] the position of the head element among the split elements
+     # @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
+     # @param attributes [Hash] the attributes of the split elements
+     def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
+       PyCall.with(@py_doc.retokenize()) do |retokenizer|
+         heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
+         retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
+       end
+     end
+
+     # String representation of the document.
+     # @return [String]
+     def to_s
+       @text
+     end
+
+     # Returns an array of tokens contained in the doc.
+     # @return [Array<Token>]
+     def tokens
+       results = []
+       PyCall::List.(@py_doc).each do |py_token|
+         results << Token.new(py_token)
+       end
+       results
+     end
+
+     # Iterates over the elements in the doc, yielding a token instance each time.
+     def each
+       PyCall::List.(@py_doc).each do |py_token|
+         yield Token.new(py_token)
+       end
+     end
+
+     # Returns a span covering the specified range within the doc.
+     # The method can be called in either of two ways: `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
+     # @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
+     # @param optional_size [Integer] an integer representing the size of the span
+     # @return [Span]
+     def span(range_or_start, optional_size = nil)
+       if optional_size
+         start_index = range_or_start
+         temp = tokens[start_index ... start_index + optional_size]
+       else
+         start_index = range_or_start.first
+         range = range_or_start
+         temp = tokens[range]
+       end
+
+       end_index = start_index + temp.size - 1
+
+       Span.new(self, start_index: start_index, end_index: end_index)
+     end
+
+     # Returns an array of spans representing noun chunks.
+     # @return [Array<Span>]
+     def noun_chunks
+       chunk_array = []
+       py_chunks = PyCall::List.(@py_doc.noun_chunks)
+       py_chunks.each do |py_chunk|
+         chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
+       end
+       chunk_array
+     end
+
+     # Returns an array of spans, each representing a sentence.
+     # @return [Array<Span>]
+     def sents
+       sentence_array = []
+       py_sentences = PyCall::List.(@py_doc.sents)
+       py_sentences.each do |py_sent|
+         sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
+       end
+       sentence_array
+     end
+
+     # Returns an array of spans, each representing a named entity.
+     # @return [Array<Span>]
+     def ents
+       # so that ents can be "each"-ed in Ruby
+       ent_array = []
+       PyCall::List.(@py_doc.ents).each do |ent|
+         ent_array << ent
+       end
+       ent_array
+     end
+
+     # Returns a span if given a range object, or a token if given an integer representing a position in the doc.
+     # @param range [Range] an ordinary Ruby Range object such as `0..3`, `1...4`, or `3 .. -1`
+     def [](range)
+       if range.is_a?(Range)
+         py_span = @py_doc[range]
+         return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
+       else
+         return Token.new(@py_doc[range])
+       end
+     end
+
+     # Returns a semantic similarity estimate.
+     # @param other [Doc] the other doc against which the similarity estimation is made
+     # @return [Float]
+     def similarity(other)
+       py_doc.similarity(other.py_doc)
+     end
+
+     # Visualizes the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
+     # @param style [String] either `dep` or `ent`
+     # @param compact [Boolean] only relevant to the `dep` style
+     # @return [String] an SVG string in the `dep` style, or an HTML string in the `ent` style
+     def displacy(style: "dep", compact: false)
+       PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
+     end
+
+     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+     def method_missing(name, *args)
+       @py_doc.send(name, *args)
+     end
+   end
+
+   # See also the spaCy Python API document for [`Language`](https://spacy.io/api/language).
+   class Language
+
+     # @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
+     attr_reader :spacy_nlp_id
+
+     # @return [Object] a Python `Language` instance accessible via `PyCall`
+     attr_reader :py_nlp
+
+     # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
+     # @param model [String] a language model installed in the system
+     def initialize(model = "en_core_web_sm")
+       @spacy_nlp_id = "nlp_#{model.object_id}"
+       PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
+       @py_nlp = PyCall.eval(@spacy_nlp_id)
+     end
+
+     # Reads and analyzes the given text.
+     # @param text [String] a text to be read and analyzed
+     def read(text)
+       Doc.new(py_nlp, text: text)
+     end
+
+     # Generates a matcher for the current language model.
+     # @return [Matcher]
+     def matcher
+       Matcher.new(@py_nlp)
+     end
+
+     # A utility method to look up a vocabulary item of the given id.
+     # @param id [Integer] a vocabulary id
+     # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
+     def vocab_string_lookup(id)
+       PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+     end
+
+     # A utility method to list pipeline components.
+     # @return [Array<String>] an array of text strings representing pipeline components
+     def pipe_names
+       pipe_array = []
+       PyCall::List.(@py_nlp.pipe_names).each do |pipe|
+         pipe_array << pipe
+       end
+       pipe_array
+     end
+
+     # A utility method to get a Python `Lexeme` object.
+     # @param text [String] a text string representing a lexeme
+     # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
+     def get_lexeme(text)
+       text = text.gsub("'", "\'")
+       @py_nlp.vocab[text]
+     end
+
+     # Returns _n_ lexemes whose vector representations are the most similar to a given vector representation of a word.
+     # @param vector [Object] a vector representation of a word (whether existing or non-existing)
+     # @param n [Integer] the number of lexemes to return
+     # @return [Array<Hash{:key => Integer, :text => String, :best_row => Array<Float>, :score => Float}>] an array of hash objects, each containing the `key`, `text`, `best_row` and similarity `score` of a lexeme
+     def most_similar(vector, n)
+       vec_array = Numpy.asarray([vector])
+       py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
+       key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist}]")
+       keys = key_texts.map{|kt| kt[0]}
+       texts = key_texts.map{|kt| kt[1]}
+       best_rows = PyCall::List.(py_result[1])[0]
+       scores = PyCall::List.(py_result[2])[0]
+
+       results = []
+       n.times do |i|
+         results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
+       end
+       results
+     end
+
+     # A utility method to batch-process many texts.
+     # @param texts [Array<String>]
+     # @param disable [Array<String>]
+     # @param batch_size [Integer]
+     # @return [Array<Doc>]
+     def pipe(texts, disable: [], batch_size: 50)
+       docs = []
+       PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
+         docs << Doc.new(@py_nlp, py_doc: py_doc)
+       end
+       docs
+     end
+
+     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+     def method_missing(name, *args)
+       @py_nlp.send(name, *args)
+     end
+   end
+
+   # See also the spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
+   class Matcher
+
+     # @return [Object] a Python `Matcher` instance accessible via `PyCall`
+     attr_reader :py_matcher
+
+     # Creates a {Matcher} instance.
+     # @param nlp [Object] a Python `Language` instance accessible via `PyCall`
+     def initialize(nlp)
+       @py_matcher = PyMatcher.(nlp.vocab)
+     end
+
+     # Adds a label string and a text pattern.
+     # @param text [String] a label string given to the pattern
+     # @param pattern [Array<Array<Hash>>] sequences of text patterns that are alternative to each other
+     def add(text, pattern)
+       @py_matcher.add(text, pattern)
+     end
+
+     # Executes the match.
+     # @param doc [Doc] a {Doc} instance
+     # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
+     def match(doc)
+       str_results = @py_matcher.(doc.py_doc).to_s
+       s = StringScanner.new(str_results[1..-2])
+       results = []
+       while s.scan_until(/(\d+), (\d+), (\d+)/)
+         next unless s.matched
+         triple = s.matched.split(", ")
+         match_id = triple[0].to_i
+         start_index = triple[1].to_i
+         end_index = triple[2].to_i - 1
+         results << {match_id: match_id, start_index: start_index, end_index: end_index}
+       end
+       results
+     end
+   end
+
    # See also the spaCy Python API document for [`Span`](https://spacy.io/api/span).
    class Span

-     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-     attr_reader :spacy_span_id
-
      # @return [Object] a Python `Span` instance accessible via `PyCall`
      attr_reader :py_span

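
The new `Language#most_similar` above returns plain Ruby hashes. A hypothetical call, assuming a model with word vectors such as `en_core_web_lg` (the vector arithmetic relies on PyCall forwarding `+` and `-` to the underlying numpy arrays):

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_lg")

    tokyo  = nlp.get_lexeme("Tokyo")
    japan  = nlp.get_lexeme("Japan")
    france = nlp.get_lexeme("France")

    # Tokyo - Japan + France should land near the capital of France
    query = tokyo.vector - japan.vector + france.vector
    nlp.most_similar(query, 5).each do |lexeme|
      puts "#{lexeme[:text]}: #{lexeme[:score]}"
    end
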
@@ -35,21 +348,18 @@ module Spacy
      alias_method :size, :count

      # It is recommended to use the {Doc#span} method to create a span. If you need to
-     # create one using {Span#initialize}, either of the two method signatures should be used: `Spacy.new(doc, py_span)` and `Spacy.new(doc, start_index, end_index, options)`.
+     # create one using {Span#initialize}, there are two method signatures:
+     # `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
      # @param doc [Doc] the document to which this span belongs
      # @param start_index [Integer] the index of the item starting the span inside a doc
      # @param end_index [Integer] the index of the item ending the span inside a doc
      # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
      def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
        @doc = doc
-       @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
        if py_span
          @py_span = py_span
        else
-         options = PyCall::Dict.(options)
-         PyCall.exec("#{@spacy_span_id}_opts = #{options}")
-         PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
-         @py_span = PyCall.eval(@spacy_span_id)
+         @py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
        end
      end

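
With the change above, a span can be built directly from a doc without any string-interpolated Python. A small sketch, assuming `Span#text` is delegated to the underlying Python span as elsewhere in the library:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_sm")
    doc = nlp.read("I like salty fries and hamburgers.")

    span = Spacy::Span.new(doc, start_index: 2, end_index: 3)
    puts span.text  # presumably "salty fries"
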
@@ -63,7 +373,7 @@ module Spacy
        results
      end

-     # Iterates over the elements in the span yielding a token instance.
+     # Iterates over the elements in the span yielding a token instance each time.
      def each
        PyCall::List.(@py_span).each do |py_token|
          yield Token.new(py_token)
@@ -97,7 +407,6 @@ module Spacy
      def ents
        ent_array = []
        PyCall::List.(@py_span.ents).each do |py_span|
-         # ent_array << ent
          ent_array << Spacy::Span.new(@doc, py_span: py_span)
        end
        ent_array
@@ -106,11 +415,11 @@ module Spacy
      # Returns a span that represents the sentence that the given span is part of.
      # @return [Span]
      def sent
-       py_span =@py_span.sent
+       py_span = @py_span.sent
        return Spacy::Span.new(@doc, py_span: py_span)
      end

-     # Returns a span if a range object is given, or a token if an integer representing the position of the doc is given.
+     # Returns a span if a range object is given, or a token if an integer representing a position in the doc is given.
      # @param range [Range] an ordinary Ruby Range object such as `0..3`, `1...4`, or `3 .. -1`
      def [](range)
        if range.is_a?(Range)
@@ -125,16 +434,16 @@ module Spacy
      # @param other [Span] the other span against which the similarity estimation is conducted
      # @return [Float]
      def similarity(other)
-       PyCall.eval("#{@spacy_span_id}.similarity(#{other.spacy_span_id})")
+       py_span.similarity(other.py_span)
      end

-     # Creates a document instance
+     # Creates a document instance from the span.
      # @return [Doc]
      def as_doc
-       Spacy::Doc.new(@doc.spacy_nlp_id, self.text)
+       Spacy::Doc.new(@doc.py_nlp, text: self.text)
      end

-     # Returns Tokens conjugated to the root of the span.
+     # Returns tokens conjugated to the root of the span.
      # @return [Array<Token>] an array of tokens
      def conjuncts
        conjunct_array = []
@@ -144,7 +453,7 @@ module Spacy
        conjunct_array
      end

-     # Returns Tokens that are to the left of the span, whose heads are within the span.
+     # Returns tokens that are to the left of the span, whose heads are within the span.
      # @return [Array<Token>] an array of tokens
      def lefts
        left_array = []
@@ -189,7 +498,8 @@ module Spacy
      # @return [String] a string representing the token
      attr_reader :text

-     # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens. There is no way to generate a token from scratch but relying on a pre-exising Python {Token} object.
+     # It is recommended to use the {Doc#tokens} or {Span#tokens} methods to create tokens.
+     # There is no way to generate a token from scratch other than relying on a pre-existing Python {Token} object.
      # @param py_token [Object] Python `Token` object
      def initialize(py_token)
        @py_token = py_token
@@ -253,7 +563,7 @@ module Spacy
      end

      # Returns a hash or string of morphological information
-     # @param dict [Boolean] if true, a hash will be returned instead of a string
+     # @param hash [Boolean] if true, a hash will be returned instead of a string
      # @return [Hash, String]
      def morphology(hash = true)
        if @py_token.has_morph
@@ -278,310 +588,6 @@ module Spacy
      end
    end

-   # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
-   class Doc
-
-     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-     attr_reader :spacy_nlp_id
-
-     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-     attr_reader :spacy_doc_id
-
-     # @return [Object] a Python `Doc` instance accessible via `PyCall`
-     attr_reader :py_doc
-
-     # @return [String] a text string of the document
-     attr_reader :text
-
-     include Enumerable
-
-     alias_method :length, :count
-     alias_method :len, :count
-     alias_method :size, :count
-
-     # Creates a new instance of {Doc}.
-     # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
-     # @param text [String] The text string to be analyzed
-     def initialize(nlp_id, text)
-       @text = text
-       @spacy_nlp_id = nlp_id
-       @spacy_doc_id = "doc_#{text.object_id}"
-       quoted = text.gsub('"', '\"')
-       PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
-       PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
-       @py_doc = PyCall.eval(@spacy_doc_id)
-     end
-
-
-     # Retokenizes the text merging a span into a single token.
-     # @param start_index [Integer] The start position of the span to be retokenized in the document
-     # @param end_index [Integer] The end position of the span to be retokenized in the document
-     # @param attributes [Hash] Attributes to set on the merged token
-     def retokenize(start_index, end_index, attributes = {})
-       py_attrs = PyCall::Dict.(attributes)
-       PyCall.exec(<<PY)
- with #{@spacy_doc_id}.retokenize() as retokenizer:
-     retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
- PY
-       @py_doc = PyCall.eval(@spacy_doc_id)
-     end
-
-     # Retokenizes the text splitting the specified token.
-     # @param pos_in_doc [Integer] The position of the span to be retokenized in the document
-     # @param split_array [Array<String>] text strings of the split results
-     # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
-     # @param attributes [Hash] The attributes of the split elements
-     def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
-       py_attrs = PyCall::Dict.(attributes)
-       py_split_array = PyCall::List.(split_array)
-       PyCall.exec(<<PY)
- with #{@spacy_doc_id}.retokenize() as retokenizer:
-     heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
-     attrs = #{py_attrs}
-     split_array = #{py_split_array}
-     retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
- PY
-       @py_doc = PyCall.eval(@spacy_doc_id)
-     end
-
-     # String representation of the token.
-     # @return [String]
-     def to_s
-       @text
-     end
-
-     # Returns an array of tokens contained in the doc.
-     # @return [Array<Token>]
-     def tokens
-       results = []
-       PyCall::List.(@py_doc).each do |py_token|
-         results << Token.new(py_token)
-       end
-       results
-     end
-
-     # Iterates over the elements in the doc yielding a token instance.
-     def each
-       PyCall::List.(@py_doc).each do |py_token|
-         yield Token.new(py_token)
-       end
-     end
-
-     # Returns a span of the specified range within the doc.
-     # The method should be used either of the two ways: `Doc#span(range)` or `Doc#span{start_pos, size_of_span}`.
-     # @param range_or_start [Range, Integer] A range object, or, alternatively, an integer that represents the start position of the span
-     # @param optional_size [Integer] An integer representing the size of the span
-     # @return [Span]
-     def span(range_or_start, optional_size = nil)
-       if optional_size
-         start_index = range_or_start
-         temp = tokens[start_index ... start_index + optional_size]
-       else
-         start_index = range_or_start.first
-         range = range_or_start
-         temp = tokens[range]
-       end
-
-       end_index = start_index + temp.size - 1
-
-       Span.new(self, start_index: start_index, end_index: end_index)
-     end
-
-     # Returns an array of spans representing noun chunks.
-     # @return [Array<Span>]
-     def noun_chunks
-       chunk_array = []
-       py_chunks = PyCall::List.(@py_doc.noun_chunks)
-       py_chunks.each do |py_chunk|
-         chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
-       end
-       chunk_array
-     end
-
-     # Returns an array of spans representing sentences.
-     # @return [Array<Span>]
-     def sents
-       sentence_array = []
-       py_sentences = PyCall::List.(@py_doc.sents)
-       py_sentences.each do |py_sent|
-         sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
-       end
-       sentence_array
-     end
-
-     # Returns an array of spans representing named entities.
-     # @return [Array<Span>]
-     def ents
-       # so that ents canbe "each"-ed in Ruby
-       ent_array = []
-       PyCall::List.(@py_doc.ents).each do |ent|
-         ent_array << ent
-       end
-       ent_array
-     end
-
-     # Returns a span if given a range object; returns a token if given an integer representing a position in the doc.
-     # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
-     def [](range)
-       if range.is_a?(Range)
-         py_span = @py_doc[range]
-         return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
-       else
-         return Token.new(@py_doc[range])
-       end
-     end
-
-     # Returns a semantic similarity estimate.
-     # @param other [Doc] the other doc to which a similarity estimation is made
-     # @return [Float]
-     def similarity(other)
-       PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
-     end
-
-     # Visualize the document in one of two styles: dep (dependencies) or ent (named entities).
-     # @param style [String] Either `dep` or `ent`
-     # @param compact [Boolean] Only relevant to the `dep` style
-     # @return [String] in the case of `dep`, the output text is an SVG while in the `ent` style, the output text is an HTML.
-     def displacy(style: "dep", compact: false)
-       PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
-     end
-
-     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
-     def method_missing(name, *args)
-       @py_doc.send(name, *args)
-     end
-   end
-
-   # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
-   class Matcher
-
-     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-     attr_reader :spacy_matcher_id
-
-     # @return [Object] a Python `Matcher` instance accessible via `PyCall`
-     attr_reader :py_matcher
-
-     # Creates a {Matcher} instance
-     # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
-     def initialize(nlp_id)
-       @spacy_matcher_id = "doc_#{nlp_id}_matcher"
-       PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
-       @py_matcher = PyCall.eval(@spacy_matcher_id)
-     end
-
-     # Adds a label string and a text pattern.
-     # @param text [String] a label string given to the pattern
-     # @param pattern [Array<Array<Hash>>] alternative sequences of text patterns
-     def add(text, pattern)
-       @py_matcher.add(text, pattern)
-     end
-
-     # Execute the match.
-     # @param doc [Doc] An {Doc} instance
-     # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] The id of the matched pattern, the starting position, and the end position
-     def match(doc)
-       str_results = PyCall.eval("#{@spacy_matcher_id}(#{doc.spacy_doc_id})").to_s
-       s = StringScanner.new(str_results[1..-2])
-       results = []
-       while s.scan_until(/(\d+), (\d+), (\d+)/)
-         next unless s.matched
-         triple = s.matched.split(", ")
-         match_id = triple[0].to_i
-         start_index = triple[1].to_i
-         end_index = triple[2].to_i - 1
-         results << {match_id: match_id, start_index: start_index, end_index: end_index}
-       end
-       results
-     end
-   end
-
-   # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
-   class Language
-
-     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
-     attr_reader :spacy_nlp_id
-
-     # @return [Object] a Python `Language` instance accessible via `PyCall`
-     attr_reader :py_nlp
-
-     # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
-     # @param model [String] A language model installed in the system
-     def initialize(model = "en_core_web_sm")
-       @spacy_nlp_id = "nlp_#{model.object_id}"
-       PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
-       PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
-       @py_nlp = PyCall.eval(@spacy_nlp_id)
-     end
-
-     # Reads and analyze the given text.
-     # @param text [String] A text to be read and analyzed
-     def read(text)
-       Doc.new(@spacy_nlp_id, text)
-     end
-
-     # Generates a matcher for the current language model.
-     # @return [Matcher]
-     def matcher
-       Matcher.new(@spacy_nlp_id)
-     end
-
-     # A utility method to lookup a vocabulary item of the given id.
-     # @param id [Integer] A vocabulary id
-     # @return [Object] A Python `Lexeme` object
-     def vocab_string_lookup(id)
-       PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
-     end
-
-     # A utility method to list pipeline components.
-     # @return [Array<String>] An array of text strings representing pipeline components
-     def pipe_names
-       pipe_array = []
-       PyCall::List.(@py_nlp.pipe_names).each do |pipe|
-         pipe_array << pipe
-       end
-       pipe_array
-     end
-
-     # A utility method to get the tokenizer Python object.
-     # @return [Object] Python `Tokenizer` object
-     def tokenizer
-       return PyCall.eval("#{@spacy_nlp_id}.tokenizer")
-     end
-
-     # A utility method to get a Python `Lexeme` object.
-     # @param text [String] A text string representing a lexeme
-     # @return [Object] Python `Lexeme` object
-     def get_lexeme(text)
-       text = text.gsub("'", "\'")
-       py_lexeme = PyCall.eval("#{@spacy_nlp_id}.vocab['#{text}']")
-       return py_lexeme
-     end
-
-     # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
-     # @param vector [Object] A vector representation of a word (whether existing or non-existing)
-     # @return [Array<Hash{:key => Integer, :text => String, :best_row => Array<Float>, :score => Float}>] An array of hash objects each containing the `key`, `text`, `best_row` and similarity `score` of a lexeme
-     def most_similar(vector, n)
-       vec_array = Numpy.asarray([vector])
-       py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
-       key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
-       keys = key_texts.map{|kt| kt[0]}
-       texts = key_texts.map{|kt| kt[1]}
-       best_rows = PyCall::List.(py_result[1])[0]
-       scores = PyCall::List.(py_result[2])[0]
-
-       results = []
-       n.times do |i|
-         results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
-       end
-
-       results
-     end
-
-     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
-     def method_missing(name, *args)
-       @py_nlp.send(name, *args)
-     end
-   end

  end

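
The rewritten `Matcher` gets its results by parsing the stringified Python return value with `StringScanner`. A hypothetical usage sketch, reusing the pattern-hash style of the gem's bundled examples:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_sm")
    matcher = nlp.matcher
    matcher.add("HELLO_WORLD", [[{LOWER: "hello"}, {LOWER: "world"}]])

    doc = nlp.read("Hello World and hello world!")
    matcher.match(doc).each do |m|
      p m  # e.g. {match_id: ..., start_index: 0, end_index: 1}
    end
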
data/lib/ruby-spacy/version.rb CHANGED
@@ -2,5 +2,5 @@

  module Spacy
    # The version number of the module
-   VERSION = "0.1.2"
+   VERSION = "0.1.3"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: ruby-spacy
  version: !ruby/object:Gem::Version
-   version: 0.1.2
+   version: 0.1.3
  platform: ruby
  authors:
  - Yoichiro Hasebe
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2021-06-26 00:00:00.000000000 Z
+ date: 2021-06-28 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: pycall
@@ -66,6 +66,7 @@ extra_rdoc_files: []
  files:
  - ".gitignore"
  - ".yardopts"
+ - CHANGELOG.md
  - Gemfile
  - Gemfile.lock
  - LICENSE.txt
@@ -123,7 +124,6 @@ files:
  - examples/linguistic_features/sentence_segmentation.rb
  - examples/linguistic_features/similarity.rb
  - examples/linguistic_features/similarity_between_spans.rb
- - examples/linguistic_features/special_case_tokenization_rules.rb
  - examples/linguistic_features/tokenization.rb
  - examples/rule_based_matching/creating_spans_from_matches.rb
  - examples/rule_based_matching/matcher.rb
@@ -149,7 +149,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubygems_version: 3.2.11
+ rubygems_version: 3.2.3
  signing_key:
  specification_version: 4
  summary: A wrapper module for using spaCy natural language processing library from
examples/linguistic_features/special_case_tokenization_rules.rb DELETED
@@ -1,19 +0,0 @@
- require "ruby-spacy"
- require "terminal-table"
-
- nlp = Spacy::Language.new("en_core_web_sm")
-
- doc = nlp.read("gimme that")
-
- puts doc.tokens.join(" ")
-
- # Add special case rule
- special_case = [{ORTH: "gim"}, {ORTH: "me"}]
- tokenizer = nlp.tokenizer
- tokenizer.add_special_case("gimme", special_case)
-
- # Check new tokenization
- puts nlp.read("gimme that").tokens.join(" ")
-
- # gimme that
- # gim me that
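
The removed example above relied on the `Language#tokenizer` helper, which is also dropped from data/lib/ruby-spacy.rb in this release. Since `Language#method_missing` forwards unknown calls to the underlying Python object, the same special-case rule can presumably still be set up without the helper:

    require "ruby-spacy"

    nlp = Spacy::Language.new("en_core_web_sm")

    # method_missing forwards #tokenizer to the Python nlp object
    special_case = [{ORTH: "gim"}, {ORTH: "me"}]
    nlp.tokenizer.add_special_case("gimme", special_case)

    puts nlp.read("gimme that").tokens.join(" ")  # => gim me that
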