RubyGems - ruby-spacy - Versions diffs - 0.2.3 → 0.4.0 - Mend

ruby-spacy 0.2.3 → 0.4.0

This diff compares the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/.github/FUNDING.yml +6 -0
data/.gitignore +1 -0
data/CHANGELOG.md +24 -7
data/Gemfile +1 -1
data/README.md +120 -22
data/lib/ruby-spacy/openai_client.rb +166 -0
data/lib/ruby-spacy/openai_helper.rb +91 -0
data/lib/ruby-spacy/version.rb +1 -1
data/lib/ruby-spacy.rb +455 -248
data/ruby-spacy.gemspec +3 -2
metadata +34 -20

data/lib/ruby-spacy.rb CHANGED Viewed

@@ -1,27 +1,29 @@
 # frozen_string_literal: true
 require_relative "ruby-spacy/version"
+require_relative "ruby-spacy/openai_client"
+require_relative "ruby-spacy/openai_helper"
 require "numpy"
-require "openai"
 require "pycall"
-require "strscan"
 require "timeout"
-begin
-  PyCall.init
-  _spacy = PyCall.import_module("spacy")
-rescue PyCall::PyError => e
-  puts "Failed to initialize PyCall or import spacy: #{e.message}"
-  puts "Python traceback:"
-  puts e.traceback
-  raise
-end
+require "json"
+require "base64"
 # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
 module Spacy
   MAX_RETRIAL = 5
-  spacy = PyCall.import_module("spacy")
+  begin
+    PyCall.init
+    spacy = PyCall.import_module("spacy")
+  rescue PyCall::PyError => e
+    puts "Failed to initialize PyCall or import spacy: #{e.message}"
+    puts "Python traceback:"
+    puts e.traceback
+    raise
+  end
+  Builtins = PyCall.import_module("builtins")
   SpacyVersion = spacy.__version__
   # Python `Language` class
@@ -39,6 +41,9 @@ module Spacy
   # Python `Matcher` class object
   PyMatcher = spacy.matcher.Matcher
+  # Python `PhraseMatcher` class object
+  PyPhraseMatcher = spacy.matcher.PhraseMatcher
   # Python `displacy` object
   PyDisplacy = PyCall.import_module('spacy.displacy')
@@ -49,16 +54,15 @@ module Spacy
     PyCall::List.call(py_generator)
   end
-  @openai_client = nil
-  def self.openai_client(access_token:)
-    # If @client is already set, just return it. Otherwise, create a new instance.
-    @openai_client ||= OpenAI::Client.new(access_token: access_token)
-  end
-  # Provide an accessor method to get the client (optional)
-  def self.client
-    @openai_client
+  # Checks if a Python object has a given attribute using builtins.hasattr.
+  # Falls back to true if the check itself fails (e.g. due to PyCall issues).
+  # @param py_obj [Object] a Python object
+  # @param attr [String, Symbol] the attribute name to check
+  # @return [Boolean]
+  def self.py_hasattr?(py_obj, attr)
+    Builtins.hasattr(py_obj, attr.to_s)
+  rescue StandardError
+    true
   end
   # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
@@ -84,17 +88,19 @@ module Spacy
     # @param nlp [Language] an instance of {Language} class
     # @param py_doc [Object] an instance of Python `Doc` class
     # @param text [String] the text string to be analyzed
-    def initialize(nlp, py_doc: nil, text: nil, max_retrial: MAX_RETRIAL,
-                   retrial: 0)
+    def initialize(nlp, py_doc: nil, text: nil, max_retrial: MAX_RETRIAL)
       @py_nlp = nlp
-      @py_doc = py_doc || @py_doc = nlp.call(text)
-      @text = @py_doc.text
-    rescue StandardError
-      retrial += 1
-      raise "Error: Failed to construct a Doc object" unless retrial <= max_retrial
+      retrial = 0
+      begin
+        @py_doc = py_doc || nlp.call(text)
+        @text = @py_doc.text
+      rescue StandardError
+        retrial += 1
+        raise "Error: Failed to construct a Doc object" unless retrial <= max_retrial
-      sleep 0.5
-      initialize(nlp, py_doc: py_doc, text: text, max_retrial: max_retrial, retrial: retrial)
+        sleep 0.5
+        retry
+      end
     end
     # Retokenizes the text merging a span into a single token.
@@ -128,11 +134,7 @@ module Spacy
     # Returns an array of tokens contained in the doc.
     # @return [Array<Token>]
     def tokens
-      results = []
-      PyCall::List.call(@py_doc).each do |py_token|
-        results << Token.new(py_token)
-      end
-      results
+      PyCall::List.call(@py_doc).map { |py_token| Token.new(py_token) }
     end
     # Iterates over the elements in the doc yielding a token instance each time.
@@ -148,54 +150,50 @@ module Spacy
     # @param optional_size [Integer] an integer representing the size of the span
     # @return [Span]
     def span(range_or_start, optional_size = nil)
+      doc_len = PyCall.len(@py_doc)
       if optional_size
         start_index = range_or_start
-        temp = tokens[start_index...start_index + optional_size]
+        start_index += doc_len if start_index < 0
+        end_index = start_index + optional_size - 1
       else
-        start_index = range_or_start.first
         range = range_or_start
-        temp = tokens[range]
+        start_index = range.first
+        start_index += doc_len if start_index < 0
+        end_val = range.end
+        if end_val.nil?
+          end_index = doc_len - 1
+        else
+          end_val += doc_len if end_val < 0
+          end_index = range.exclude_end? ? end_val - 1 : end_val
+        end
       end
-      end_index = start_index + temp.size - 1
       Span.new(self, start_index: start_index, end_index: end_index)
     end
     # Returns an array of spans representing noun chunks.
     # @return [Array<Span>]
     def noun_chunks
-      chunk_array = []
-      py_chunks = PyCall::List.call(@py_doc.noun_chunks)
-      py_chunks.each do |py_chunk|
-        chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
+      PyCall::List.call(@py_doc.noun_chunks).map do |py_chunk|
+        Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
       end
-      chunk_array
     end
     # Returns an array of spans each representing a sentence.
     # @return [Array<Span>]
     def sents
-      sentence_array = []
-      py_sentences = PyCall::List.call(@py_doc.sents)
-      py_sentences.each do |py_sent|
-        sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
+      PyCall::List.call(@py_doc.sents).map do |py_sent|
+        Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
       end
-      sentence_array
     end
     # Returns an array of spans each representing a named entity.
     # @return [Array<Span>]
     def ents
-      # so that ents canbe "each"-ed in Ruby
-      ent_array = []
-      PyCall::List.call(@py_doc.ents).each do |ent|
-        ent.define_singleton_method :label do
-          label_
-        end
-        ent_array << ent
+      PyCall::List.call(@py_doc.ents).map do |py_span|
+        Span.new(self, py_span: py_span)
       end
-      ent_array
     end
     # Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
@@ -216,6 +214,31 @@ module Spacy
       py_doc.similarity(other.py_doc)
     end
+    # Serializes the doc to a binary string.
+    # The binary data includes all annotations (tokens, entities, etc.) and can be
+    # used to restore the doc later without re-processing.
+    # @return [String] binary representation of the doc
+    # @example Save doc to file
+    #   doc = nlp.read("Hello world")
+    #   File.binwrite("doc.bin", doc.to_bytes)
+    def to_bytes
+      @py_doc.to_bytes.force_encoding(Encoding::BINARY)
+    end
+    # Restores a doc from binary data created by {#to_bytes}.
+    # This is useful for caching processed documents to avoid re-processing.
+    # @param byte_string [String] binary data from {#to_bytes}
+    # @return [Doc] the restored doc
+    # @example Load doc from file
+    #   bytes = File.binread("doc.bin")
+    #   doc = Spacy::Doc.from_bytes(nlp, bytes)
+    def self.from_bytes(nlp, byte_string)
+      b64 = Base64.strict_encode64(byte_string)
+      py_bytes = PyCall.eval("__import__('base64').b64decode('#{b64}')")
+      py_doc = nlp.py_nlp.call("").from_bytes(py_bytes)
+      new(nlp.py_nlp, py_doc: py_doc)
+    end
     # Visualize the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
     # @param style [String] either `dep` or `ent`
     # @param compact [Boolean] only relevant to the `dep' style
@@ -224,12 +247,86 @@ module Spacy
       PyDisplacy.render(py_doc, style: style, options: { compact: compact }, jupyter: false)
     end
+    # Generates a JSON string summarizing the linguistic analysis of the document.
+    # Designed to be passed as context to an LLM (e.g., via {OpenAIHelper#chat}).
+    #
+    # @param sections [Array<Symbol>] which sections to include
+    #   (:text, :tokens, :entities, :noun_chunks, :sentences)
+    # @param token_attributes [Array<Symbol>] which token attributes to include
+    #   (:text, :lemma, :pos, :tag, :dep, :head, :ent_type, :morphology)
+    # @return [String] a JSON string of the linguistic summary
+    def linguistic_summary(sections: [:text, :tokens, :entities, :noun_chunks],
+                           token_attributes: [:text, :lemma, :pos, :dep, :head])
+      result = {}
+      sections.each do |section|
+        case section
+        when :text
+          result[:text] = @text
+        when :tokens
+          result[:tokens] = tokens.map do |token|
+            token_hash = {}
+            token_attributes.each do |attr|
+              case attr
+              when :head
+                token_hash[:head] = token.head.text
+              when :morphology
+                # Use string form and parse to ensure a plain Ruby Hash for JSON serialization
+                morph_str = token.morphology(hash: false)
+                token_hash[:morphology] = if morph_str.empty?
+                                            {}
+                                          else
+                                            morph_str.split("|").each_with_object({}) do |pair, h|
+                                              k, v = pair.split("=", 2)
+                                              h[k] = v
+                                            end
+                                          end
+              else
+                token_hash[attr] = token.send(attr)
+              end
+            end
+            token_hash
+          end
+        when :entities
+          ent_list = ents
+          result[:entities] = ent_list.map do |ent|
+            { text: ent.text, label: ent.label }
+          end
+        when :noun_chunks
+          result[:noun_chunks] = noun_chunks.map do |chunk|
+            { text: chunk.text, root: chunk.root.text }
+          end
+        when :sentences
+          result[:sentences] = sents.map(&:text)
+        end
+      end
+      result.to_json
+    end
+    # Sends a query to OpenAI's chat completion API with optional tool support.
+    # The get_tokens tool allows the model to request token-level linguistic analysis.
+    #
+    # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
+    # @param max_completion_tokens [Integer] Maximum tokens in the response
+    # @param max_tokens [Integer] Alias for max_completion_tokens (deprecated, for backward compatibility)
+    # @param temperature [Float] Sampling temperature (ignored for GPT-5 models)
+    # @param model [String] The model to use (default: gpt-5-mini)
+    # @param messages [Array<Hash>] Conversation history (for recursive tool calls). Note: this array is modified in place when tool calls occur.
+    # @param prompt [String, nil] System prompt for the query
+    # @return [String, nil] The model's response content
     def openai_query(access_token: nil,
-                     max_tokens: 1000,
+                     max_completion_tokens: nil,
+                     max_tokens: nil,
                      temperature: 0.7,
-                     model: "gpt-4o-mini",
+                     model: "gpt-5-mini",
                      messages: [],
-                     prompt: nil)
+                     prompt: nil,
+                     response_format: nil,
+                     max_tool_call_depth: 5,
+                     _tool_call_depth: 0)
+      # Support both max_completion_tokens and max_tokens for backward compatibility
+      max_completion_tokens ||= max_tokens || 1000
       if messages.empty?
         messages = [
           { role: "system", content: prompt },
@@ -237,122 +334,164 @@ module Spacy
         ]
       end
-      access_token ||= ENV["OPENAI_API_KEY"]
-      raise "Error: OPENAI_API_KEY is not set" unless access_token
-      begin
-        response = Spacy.openai_client(access_token: access_token).chat(
-          parameters: {
-            model: model,
-            messages: messages,
-            max_tokens: max_tokens,
-            temperature: temperature,
-            function_call: "auto",
-            stream: false,
-            functions: [
-              {
-                name: "get_tokens",
-                description: "Tokenize given text and return a list of tokens with their attributes: surface, lemma, tag, pos (part-of-speech), dep (dependency), ent_type (entity type), and morphology",
-                "parameters": {
-                  "type": "object",
-                  "properties": {
-                    "text": {
-                      "type": "string",
-                      "description": "text to be tokenized"
-                    }
-                  },
-                  "required": ["text"]
-                }
+      client = openai_client(access_token)
+      # Tool definition for token analysis (GPT-5 tools API format)
+      tools = nil
+      tool_choice = nil
+      if _tool_call_depth < max_tool_call_depth
+        tools = [
+          {
+            type: "function",
+            function: {
+              name: "get_tokens",
+              description: "Tokenize given text and return a list of tokens with their attributes: surface, lemma, tag, pos (part-of-speech), dep (dependency), ent_type (entity type), and morphology",
+              parameters: {
+                type: "object",
+                properties: {
+                  text: {
+                    type: "string",
+                    description: "text to be tokenized"
+                  }
+                },
+                required: ["text"]
               }
-            ]
+            }
           }
-        )
+        ]
+        tool_choice = "auto"
+      end
+      response = client.chat(
+        model: model,
+        messages: messages,
+        max_completion_tokens: max_completion_tokens,
+        temperature: temperature,
+        tools: tools,
+        tool_choice: tool_choice,
+        response_format: response_format
+      )
+      message = response.dig("choices", 0, "message")
-        message = response.dig("choices", 0, "message")
+      # Handle tool calls (GPT-5 format)
+      if message["tool_calls"] && !message["tool_calls"].empty?
+        messages << message
+        message["tool_calls"].each do |tool_call|
+          function_name = tool_call.dig("function", "name")
+          tool_call_id = tool_call["id"]
-        if message["role"] == "assistant" && message["function_call"]
-          messages << message
-          function_name = message.dig("function_call", "name")
-          _args = JSON.parse(message.dig("function_call", "arguments"))
           case function_name
           when "get_tokens"
-            res = tokens.map do |t|
+            result = tokens.map do |t|
               {
-                "surface": t.text,
-                "lemma": t.lemma,
-                "pos": t.pos,
-                "tag": t.tag,
-                "dep": t.dep,
-                "ent_type": t.ent_type,
-                "morphology": t.morphology
+                surface: t.text,
+                lemma: t.lemma,
+                pos: t.pos,
+                tag: t.tag,
+                dep: t.dep,
+                ent_type: t.ent_type,
+                morphology: t.morphology
               }
             end.to_json
+            messages << {
+              role: "tool",
+              tool_call_id: tool_call_id,
+              content: result
+            }
           end
-          messages << { role: "system", content: res }
-          openai_query(access_token: access_token, max_tokens: max_tokens,
-                       temperature: temperature, model: model,
-                       messages: messages, prompt: prompt)
-        else
-          message["content"]
         end
-      rescue StandardError => e
-        puts "Error: OpenAI API call failed."
-        pp e.message
-        pp e.backtrace
+        # Recursive call to get final response after tool execution
+        openai_query(
+          access_token: access_token,
+          max_completion_tokens: max_completion_tokens,
+          temperature: temperature,
+          model: model,
+          messages: messages,
+          prompt: prompt,
+          response_format: response_format,
+          max_tool_call_depth: max_tool_call_depth,
+          _tool_call_depth: _tool_call_depth + 1
+        )
+      else
+        message["content"]
       end
-    end
+    rescue OpenAIClient::APIError => e
+      puts "Error: OpenAI API call failed - #{e.message}"
+      nil
+    end
+    # Sends a text completion request to OpenAI's chat API.
+    #
+    # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
+    # @param max_completion_tokens [Integer] Maximum tokens in the response
+    # @param max_tokens [Integer] Alias for max_completion_tokens (deprecated, for backward compatibility)
+    # @param temperature [Float] Sampling temperature (ignored for GPT-5 models)
+    # @param model [String] The model to use (default: gpt-5-mini)
+    # @return [String, nil] The completed text
+    def openai_completion(access_token: nil, max_completion_tokens: nil, max_tokens: nil, temperature: 0.7, model: "gpt-5-mini")
+      # Support both max_completion_tokens and max_tokens for backward compatibility
+      max_completion_tokens ||= max_tokens || 1000
-    def openai_completion(access_token: nil, max_tokens: 1000, temperature: 0.7, model: "gpt-4o-mini")
       messages = [
         { role: "system", content: "Complete the text input by the user." },
         { role: "user", content: @text }
       ]
-      access_token ||= ENV["OPENAI_API_KEY"]
-      raise "Error: OPENAI_API_KEY is not set" unless access_token
-      begin
-        response = Spacy.openai_client(access_token: access_token).chat(
-          parameters: {
-            model: model,
-            messages: messages,
-            max_tokens: max_tokens,
-            temperature: temperature
-          }
-        )
-        response.dig("choices", 0, "message", "content")
-      rescue StandardError => e
-        puts "Error: OpenAI API call failed."
-        pp e.message
-        pp e.backtrace
-      end
-    end
-    def openai_embeddings(access_token: nil, model: "text-embedding-ada-002")
+      client = openai_client(access_token)
+      response = client.chat(
+        model: model,
+        messages: messages,
+        max_completion_tokens: max_completion_tokens,
+        temperature: temperature
+      )
+      response.dig("choices", 0, "message", "content")
+    rescue OpenAIClient::APIError => e
+      puts "Error: OpenAI API call failed - #{e.message}"
+      nil
+    end
+    # Generates text embeddings using OpenAI's embeddings API.
+    #
+    # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
+    # @param model [String] The embeddings model (default: text-embedding-3-small)
+    # @param dimensions [Integer, nil] The number of dimensions for the output embeddings (nil uses model default)
+    # @return [Array<Float>, nil] The embedding vector
+    def openai_embeddings(access_token: nil, model: "text-embedding-3-small", dimensions: nil)
+      client = openai_client(access_token)
+      response = client.embeddings(model: model, input: @text, dimensions: dimensions)
+      response.dig("data", 0, "embedding")
+    rescue OpenAIClient::APIError => e
+      puts "Error: OpenAI API call failed - #{e.message}"
+      nil
+    end
+    private
+    def openai_client(access_token)
       access_token ||= ENV["OPENAI_API_KEY"]
       raise "Error: OPENAI_API_KEY is not set" unless access_token
-      begin
-        response = Spacy.openai_client(access_token: access_token).embeddings(
-          parameters: {
-            model: model,
-            input: @text
-          }
-        )
-        response.dig("data", 0, "embedding")
-      rescue StandardError => e
-        puts "Error: OpenAI API call failed."
-        pp e.message
-        pp e.backtrace
-      end
+      @openai_clients ||= {}
+      @openai_clients[access_token] ||= OpenAIClient.new(access_token: access_token)
     end
+    public
     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
       @py_doc.send(name, *args)
     end
-    def respond_to_missing?(sym)
-      sym ? true : super
+    def respond_to_missing?(sym, include_private = false)
+      Spacy.py_hasattr?(@py_doc, sym) || super
+    end
+    def instance_variables_to_inspect
+      [:@text]
     end
   end
@@ -366,8 +505,13 @@ module Spacy
     # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
     # @param model [String] A language model installed in the system
-    def initialize(model = "en_core_web_sm", max_retrial: MAX_RETRIAL, retrial: 0, timeout: 60)
+    def initialize(model = "en_core_web_sm", max_retrial: MAX_RETRIAL, timeout: 60)
+      unless model.to_s.match?(/\A[a-zA-Z0-9_\-\.\/]+\z/)
+        raise ArgumentError, "Invalid model name: #{model.inspect}"
+      end
       @spacy_nlp_id = "nlp_#{model.object_id}"
+      retrial = 0
       begin
         Timeout.timeout(timeout) do
           PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
@@ -398,21 +542,29 @@ module Spacy
       Matcher.new(@py_nlp)
     end
+    # Generates a phrase matcher for the current language model.
+    # PhraseMatcher is more efficient than {Matcher} for matching large terminology lists.
+    # @param attr [String] the token attribute to match on (default: "ORTH").
+    #   Use "LOWER" for case-insensitive matching.
+    # @return [PhraseMatcher]
+    # @example
+    #   matcher = nlp.phrase_matcher(attr: "LOWER")
+    #   matcher.add("PRODUCT", ["iPhone", "MacBook Pro"])
+    def phrase_matcher(attr: "ORTH")
+      PhraseMatcher.new(self, attr: attr)
+    end
     # A utility method to lookup a vocabulary item of the given id.
     # @param id [Integer] a vocabulary id
     # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
     def vocab_string_lookup(id)
-      PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+      PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{Integer(id)}]")
     end
     # A utility method to list pipeline components.
     # @return [Array<String>] An array of text strings representing pipeline components
     def pipe_names
-      pipe_array = []
-      PyCall::List.call(@py_nlp.pipe_names).each do |pipe|
-        pipe_array << pipe
-      end
-      pipe_array
+      PyCall::List.call(@py_nlp.pipe_names).to_a
     end
     # A utility method to get a Python `Lexeme` object.
@@ -461,20 +613,62 @@ module Spacy
     # @param batch_size [Integer]
     # @return [Array<Doc>]
     def pipe(texts, disable: [], batch_size: 50)
-      docs = []
-      PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
-        docs << Doc.new(@py_nlp, py_doc: py_doc)
+      PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).map do |py_doc|
+        Doc.new(@py_nlp, py_doc: py_doc)
       end
-      docs
     end
-    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism....
+    # Yields an {OpenAIHelper} instance for making OpenAI API calls within a block.
+    # The helper is configured once and reused for all calls within the block,
+    # making it efficient for batch processing with {#pipe}.
+    #
+    # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
+    # @param model [String] the default model for chat requests
+    # @param max_completion_tokens [Integer] default maximum tokens in responses
+    # @param temperature [Float] default sampling temperature
+    # @yield [OpenAIHelper] the helper instance for making API calls
+    # @return [Object] the block's return value
+    # @example Batch processing with pipe
+    #   nlp.with_openai(model: "gpt-5-mini") do |ai|
+    #     nlp.pipe(texts).map do |doc|
+    #       ai.chat(system: "Analyze.", user: doc.linguistic_summary)
+    #     end
+    #   end
+    def with_openai(access_token: nil, model: "gpt-5-mini",
+                    max_completion_tokens: 1000, temperature: 0.7)
+      helper = OpenAIHelper.new(
+        access_token: access_token,
+        model: model,
+        max_completion_tokens: max_completion_tokens,
+        temperature: temperature
+      )
+      yield helper
+    end
+    # Executes a block within spaCy's memory zone for efficient memory management.
+    # Requires spaCy >= 3.8.
+    # @yield the block to execute within the memory zone
+    # @raise [NotImplementedError] if spaCy version does not support memory zones
+    def memory_zone(&block)
+      major, minor = SpacyVersion.split(".").map(&:to_i)
+      unless major > 3 || (major == 3 && minor >= 8)
+        raise NotImplementedError, "memory_zone requires spaCy >= 3.8 (current: #{SpacyVersion})"
+      end
+      PyCall.with(@py_nlp.memory_zone, &block)
+    end
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
       @py_nlp.send(name, *args)
     end
-    def respond_to_missing?(sym)
-      sym ? true : super
+    def respond_to_missing?(sym, include_private = false)
+      Spacy.py_hasattr?(@py_nlp, sym) || super
+    end
+    def instance_variables_to_inspect
+      [:@spacy_nlp_id]
     end
   end
@@ -500,19 +694,52 @@ module Spacy
     # @param doc [Doc] an {Doc} instance
     # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
     def match(doc)
-      str_results = @py_matcher.call(doc.py_doc).to_s
-      s = StringScanner.new(str_results[1..-2])
-      results = []
-      while s.scan_until(/(\d+), (\d+), (\d+)/)
-        next unless s.matched
-        triple = s.matched.split(", ")
-        match_id = triple[0].to_i
-        start_index = triple[1].to_i
-        end_index = triple[2].to_i - 1
-        results << { match_id: match_id, start_index: start_index, end_index: end_index }
+      PyCall::List.call(@py_matcher.call(doc.py_doc)).map do |py_match|
+        { match_id: py_match[0].to_i, start_index: py_match[1].to_i, end_index: py_match[2].to_i - 1 }
       end
-      results
+    end
+  end
+  # See also spaCy Python API document for [`PhraseMatcher`](https://spacy.io/api/phrasematcher).
+  # PhraseMatcher is useful for efficiently matching large terminology lists.
+  # It's faster than {Matcher} when matching many phrase patterns.
+  class PhraseMatcher
+    # @return [Object] a Python `PhraseMatcher` instance accessible via `PyCall`
+    attr_reader :py_matcher
+    # @return [Language] the language model used by this matcher
+    attr_reader :nlp
+    # Creates a {PhraseMatcher} instance.
+    # @param nlp [Language] an instance of {Language} class
+    # @param attr [String] the token attribute to match on (default: "ORTH").
+    #   Use "LOWER" for case-insensitive matching.
+    # @example Case-insensitive matching
+    #   matcher = Spacy::PhraseMatcher.new(nlp, attr: "LOWER")
+    def initialize(nlp, attr: "ORTH")
+      @nlp = nlp
+      @py_matcher = PyPhraseMatcher.call(nlp.py_nlp.vocab, attr: attr)
+    end
+    # Adds phrase patterns to the matcher.
+    # @param label [String] a label string given to the patterns
+    # @param phrases [Array<String>] an array of phrase strings to match
+    # @example Add product names
+    #   matcher.add("PRODUCT", ["iPhone", "MacBook Pro", "iPad"])
+    def add(label, phrases)
+      patterns = phrases.map { |phrase| @nlp.py_nlp.call(phrase) }
+      @py_matcher.add(label, patterns)
+    end
+    # Execute the phrase match and return matching spans.
+    # @param doc [Doc] a {Doc} instance to search
+    # @return [Array<Span>] an array of {Span} objects with labels
+    # @example Find matches
+    #   matches = matcher.match(doc)
+    #   matches.each { |span| puts "#{span.text} => #{span.label}" }
+    def match(doc)
+      py_matches = @py_matcher.call(doc.py_doc, as_spans: true)
+      PyCall::List.call(py_matches).map { |py_span| Span.new(doc, py_span: py_span) }
     end
   end
@@ -524,6 +751,9 @@ module Spacy
     # @return [Doc] the document to which the span belongs
     attr_reader :doc
+    # @return [String] a text string of the span
+    attr_reader :text
     include Enumerable
     alias length count
@@ -539,17 +769,14 @@ module Spacy
     # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
     def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
       @doc = doc
-      @py_span = py_span || @py_span = PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
+      @py_span = py_span || PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
+      @text = @py_span.text
     end
     # Returns an array of tokens contained in the span.
     # @return [Array<Token>]
     def tokens
-      results = []
-      PyCall::List.call(@py_span).each do |py_token|
-        results << Token.new(py_token)
-      end
-      results
+      PyCall::List.call(@py_span).map { |py_token| Token.new(py_token) }
     end
     # Iterates over the elements in the span yielding a token instance each time.
@@ -562,12 +789,9 @@ module Spacy
     # Returns an array of spans of noun chunks.
     # @return [Array<Span>]
     def noun_chunks
-      chunk_array = []
-      py_chunks = PyCall::List.call(@py_span.noun_chunks)
-      py_chunks.each do |py_span|
-        chunk_array << Span.new(@doc, py_span: py_span)
+      PyCall::List.call(@py_span.noun_chunks).map do |py_span|
+        Span.new(@doc, py_span: py_span)
       end
-      chunk_array
     end
     # Returns the head token
@@ -579,22 +803,17 @@ module Spacy
     # Returns an array of spans that represents sentences.
     # @return [Array<Span>]
     def sents
-      sentence_array = []
-      py_sentences = PyCall::List.call(@py_span.sents)
-      py_sentences.each do |py_span|
-        sentence_array << Span.new(@doc, py_span: py_span)
+      PyCall::List.call(@py_span.sents).map do |py_span|
+        Span.new(@doc, py_span: py_span)
       end
-      sentence_array
     end
     # Returns an array of spans that represents named entities.
     # @return [Array<Span>]
     def ents
-      ent_array = []
-      PyCall::List.call(@py_span.ents).each do |py_span|
-        ent_array << Span.new(@doc, py_span: py_span)
+      PyCall::List.call(@py_span.ents).map do |py_span|
+        Span.new(@doc, py_span: py_span)
       end
-      ent_array
     end
     # Returns a span that represents the sentence that the given span is part of.
@@ -631,41 +850,25 @@ module Spacy
     # Returns the tokens that are conjuncts of the span's root.
     # @return [Array<Token>] an array of tokens
     def conjuncts
-      conjunct_array = []
-      PyCall::List.call(@py_span.conjuncts).each do |py_conjunct|
-        conjunct_array << Token.new(py_conjunct)
-      end
-      conjunct_array
+      PyCall::List.call(@py_span.conjuncts).map { |py_conjunct| Token.new(py_conjunct) }
     end
     # Returns tokens that are to the left of the span, whose heads are within the span.
     # @return [Array<Token>] an array of tokens
     def lefts
-      left_array = []
-      PyCall::List.call(@py_span.lefts).each do |py_left|
-        left_array << Token.new(py_left)
-      end
-      left_array
+      PyCall::List.call(@py_span.lefts).map { |py_left| Token.new(py_left) }
     end
     # Returns tokens that are to the right of the span, whose heads are within the span.
     # @return [Array<Token>] an array of tokens
     def rights
-      right_array = []
-      PyCall::List.call(@py_span.rights).each do |py_right|
-        right_array << Token.new(py_right)
-      end
-      right_array
+      PyCall::List.call(@py_span.rights).map { |py_right| Token.new(py_right) }
     end
     # Returns tokens that are within the span and tokens that descend from them.
     # @return [Array<Token>] an array of tokens
     def subtree
-      subtree_array = []
-      PyCall::List.call(@py_span.subtree).each do |py_subtree|
-        subtree_array << Token.new(py_subtree)
-      end
-      subtree_array
+      PyCall::List.call(@py_span.subtree).map { |py_subtree| Token.new(py_subtree) }
     end
     # Returns the label
@@ -674,13 +877,23 @@ module Spacy
       @py_span.label_
     end
+    # String representation of the span.
+    # @return [String]
+    def to_s
+      @text
+    end
     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
       @py_span.send(name, *args)
     end
-    def respond_to_missing?(sym)
-      sym ? true : super
+    def respond_to_missing?(sym, include_private = false)
+      Spacy.py_hasattr?(@py_span, sym) || super
+    end
+    def instance_variables_to_inspect
+      [:@text]
     end
   end
@@ -700,6 +913,12 @@ module Spacy
       @text = @py_token.text
     end
+    # Returns the character offset of the token within the parent document.
+    # @return [Integer]
+    def idx
+      @py_token.idx
+    end
     # Returns the head token
     # @return [Token]
     def head
@@ -709,51 +928,31 @@ module Spacy
     # Returns the token in question and the tokens that descend from it.
     # @return [Array<Token>] an array of tokens
     def subtree
-      descendant_array = []
-      PyCall::List.call(@py_token.subtree).each do |descendant|
-        descendant_array << Token.new(descendant)
-      end
-      descendant_array
+      PyCall::List.call(@py_token.subtree).map { |descendant| Token.new(descendant) }
     end
     # Returns the token's ancestors.
     # @return [Array<Token>] an array of tokens
     def ancestors
-      ancestor_array = []
-      PyCall::List.call(@py_token.ancestors).each do |ancestor|
-        ancestor_array << Token.new(ancestor)
-      end
-      ancestor_array
+      PyCall::List.call(@py_token.ancestors).map { |ancestor| Token.new(ancestor) }
     end
     # Returns a sequence of the token's immediate syntactic children.
     # @return [Array<Token>] an array of tokens
     def children
-      child_array = []
-      PyCall::List.call(@py_token.children).each do |child|
-        child_array << Token.new(child)
-      end
-      child_array
+      PyCall::List.call(@py_token.children).map { |child| Token.new(child) }
     end
     # The leftward immediate children of the word in the syntactic dependency parse.
     # @return [Array<Token>] an array of tokens
     def lefts
-      token_array = []
-      PyCall::List.call(@py_token.lefts).each do |token|
-        token_array << Token.new(token)
-      end
-      token_array
+      PyCall::List.call(@py_token.lefts).map { |token| Token.new(token) }
     end
     # The rightward immediate children of the word in the syntactic dependency parse.
     # @return [Array<Token>] an array of tokens
     def rights
-      token_array = []
-      PyCall::List.call(@py_token.rights).each do |token|
-        token_array << Token.new(token)
-      end
-      token_array
+      PyCall::List.call(@py_token.rights).map { |token| Token.new(token) }
     end
     # String representation of the token.
@@ -845,8 +1044,12 @@ module Spacy
       @py_token.send(name, *args)
     end
-    def respond_to_missing?(sym)
-      sym ? true : super
+    def respond_to_missing?(sym, include_private = false)
+      Spacy.py_hasattr?(@py_token, sym) || super
+    end
+    def instance_variables_to_inspect
+      [:@text]
     end
   end
@@ -920,8 +1123,12 @@ module Spacy
       @py_lexeme.send(name, *args)
     end
-    def respond_to_missing?(sym)
-      sym ? true : super
+    def respond_to_missing?(sym, include_private = false)
+      Spacy.py_hasattr?(@py_lexeme, sym) || super
+    end
+    def instance_variables_to_inspect
+      [:@text]
     end
   end
 end