RubyGems - mini_search - Versions diffs - 1.0.3 → 1.2.0 - Mend

mini_search 1.0.3 → 1.2.0

Files changed (13) hide show

checksums.yaml +4 -4
data/.devcontainer/devcontainer.json +31 -0
data/Gemfile.lock +22 -18
data/README.md +116 -36
data/bin/console +1 -0
data/lib/mini_search/compound_tokenizer.rb +15 -0
data/lib/mini_search/inverted_index.rb +2 -2
data/lib/mini_search/ngram_tokenizer.rb +15 -0
data/lib/mini_search/pipeline.rb +13 -2
data/lib/mini_search/version.rb +1 -1
data/lib/mini_search.rb +42 -11
data/mini_search.gemspec +3 -2
metadata +27 -10

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: acd453c483524bd3ab16323edf2d237a6c037ac6aac48d365f395c80a343a606
-  data.tar.gz: '0681c134db08c7b354ef53ef6670aa7d984de43fbb959175cf94f0efc7d0d86c'
+  metadata.gz: 45bd263c2e9f4056e9401efe9f26bf735cafb41cdb9fef570e7426233ff6a92d
+  data.tar.gz: d29e30b91edf2434cb99d219b83b1dbba26e47190cf53be01a32355992af5042
 SHA512:
-  metadata.gz: 726b0e4d969f9040d1b18ec9967dbf0cf28fc473b36377279f03b5ee782f8e619c9ee0e89087a540ac61a7a27fc8757113f15c7acd3153bc6b6e5fd427174fc7
-  data.tar.gz: 816d97d2bb91349bbe9b134d5d1b8425910fa1507c76db0743e5d9921bf74c1446e5939a50b34b1116bf0712a279af5766587ec771557ba66f29527f2f719b92
+  metadata.gz: 5e1661985c5bee26475a35424fe78b81b462ea90e41984623bb13c3621626a241b3107390c69ddae177b6b6a3fdb49886e647bc4f2650ee1a9a308e62f7350a8
+  data.tar.gz: 36b1a2729315cec0e2b3805dea9dd05c888b9a2795b9e57e34f7a454e98c589705bcf5ad2bb5df2c022e1b3dc6d8b7388d9b5ea4bd6dd04a9412ceb4e2b1a2d9

data/.devcontainer/devcontainer.json ADDED Viewed

@@ -0,0 +1,31 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/ruby
+{
+	"name": "Ruby",
+	// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+	"image": "mcr.microsoft.com/devcontainers/ruby:0-3.1-bullseye",
+	"features": {
+		"ghcr.io/devcontainers/features/git:1": {
+			"ppa": true,
+			"version": "os-provided"
+		}
+	},
+	"customizations": {
+		"vscode": {
+			"extensions": [
+				// Add the IDs of extensions you want installed when the container is created.
+				"rebornix.Ruby"
+			]
+		}
+	}
+	// Features to add to the dev container. More info: https://containers.dev/features.
+	// "features": {},
+	// Use 'forwardPorts' to make a list of ports inside the container available locally.
+	// "forwardPorts": [],
+	// Use 'postCreateCommand' to run commands after the container is created.
+	// "postCreateCommand": "ruby --version",
+	// Configure tool-specific properties.
+	// "customizations": {},
+	// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
+	// "remoteUser": "root"
+}

data/Gemfile.lock CHANGED Viewed

@@ -1,35 +1,39 @@
 PATH
   remote: .
   specs:
-    mini_search (1.0.0)
+    mini_search (1.0.4)
+      ruby_ngrams (~> 0.0.6)
 GEM
   remote: https://rubygems.org/
   specs:
-    diff-lcs (1.3)
-    rake (10.5.0)
-    rspec (3.8.0)
-      rspec-core (~> 3.8.0)
-      rspec-expectations (~> 3.8.0)
-      rspec-mocks (~> 3.8.0)
-    rspec-core (3.8.0)
-      rspec-support (~> 3.8.0)
-    rspec-expectations (3.8.1)
+    diff-lcs (1.5.0)
+    rake (12.3.3)
+    rspec (3.12.0)
+      rspec-core (~> 3.12.0)
+      rspec-expectations (~> 3.12.0)
+      rspec-mocks (~> 3.12.0)
+    rspec-core (3.12.1)
+      rspec-support (~> 3.12.0)
+    rspec-expectations (3.12.2)
       diff-lcs (>= 1.2.0, < 2.0)
-      rspec-support (~> 3.8.0)
-    rspec-mocks (3.8.0)
+      rspec-support (~> 3.12.0)
+    rspec-mocks (3.12.5)
       diff-lcs (>= 1.2.0, < 2.0)
-      rspec-support (~> 3.8.0)
-    rspec-support (3.8.0)
+      rspec-support (~> 3.12.0)
+    rspec-support (3.12.0)
+    ruby_cli (0.2.1)
+    ruby_ngrams (0.0.6)
+      ruby_cli (>= 0.2.0)
 PLATFORMS
-  ruby
+  x86_64-linux
 DEPENDENCIES
-  bundler (~> 1.16)
+  bundler (~> 2.4.10)
   mini_search!
-  rake (~> 10.0)
+  rake (~> 12.0)
   rspec (~> 3.0)
 BUNDLED WITH
-   1.16.4
+   2.4.10

data/README.md CHANGED Viewed

@@ -20,6 +20,36 @@ Or install it yourself as:
     $ gem install mini_search
+## BM25 (from wikipedia)
+BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document, regardless
+of their proximity within the document. It is a family of scoring functions with slightly different components and parameters.
+One of the most prominent instantiations of the function is as follows.
+Given a query Q, containing keywords `q1....qn` the BM25 score of a document `D` is:
+![BM25 Formula](formula1.svg)
+where `f(qi, D)` is qi's term frequency (tf) in the document `D`, `|D|` is the length of the document `D` in words, and avgdl is the
+average document length in the text collection from which documents are drawn. `k1` and `b` are free parameters, usually chosen, in absence of
+an advanced optimization, as `k1 in [1.2, 2.0]` and `b = 0.75`. `IDF(qi)` is the IDF (inverse document frequency) weight of the query term
+`qi`. It is usually computed as:
+![IDF Formula](formula2.svg)
+where `N` is the total number of documents in the collection, and `n(qi)` is the number of documents containing `qi`.
+There are several interpretations for IDF and slight variations on its formula. In the original BM25 derivation,
+the IDF component is derived from the Binary Independence Model.
+The above formula for IDF has drawbacks for terms appearing in more than half of the corpus documents. These terms' IDF is negative,
+so for any two almost-identical documents, one which contains the term may be ranked lower than one which does not. This is often an
+undesirable behavior, so many applications adjust the IDF formula in various ways:
+Each summand can be given a floor of 0, to trim out common terms;
+The IDF function can be given a floor of a constant `ε`, to avoid common terms being ignored at all;
+The IDF function can be replaced with a similarly shaped one which is non-negative, or strictly positive to avoid terms being ignored at all.
 ## Inverted Index
 MiniSearch implements an inverted index (basically a hashmap where terms are keys and values are the documents that contain that key.
@@ -165,42 +195,6 @@ With this changes our index would be:
 Much better now. We could apply other steps, like removing words that are irrelevant for us (stop words) or
 adding synonyms for some words, but these other changes are language-specific.
-TODO
-## Language support (stop words, stemmers)
-TODO
-## BM25 (from wikipedia)
-BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document, regardless
-of their proximity within the document. It is a family of scoring functions with slightly different components and parameters.
-One of the most prominent instantiations of the function is as follows.
-Given a query Q, containing keywords `q1....qn` the BM25 score of a document `D` is:
-![BM25 Formula](formula1.svg)
-where `f(qi, D)` is qi's term frequency (tf) in the document `D`, `|D|` is the length of the document `D` in words, and avgdl is the
-average document length in the text collection from which documents are drawn. `k1` and `b` are free parameters, usually chosen, in absence of
-an advanced optimization, as `k1 in |1.2,2.0|` and `b = 0.75`. `IDF(qi)` is the IDF (inverse document frequency) weight of the query term
-`qi`. It is usually computed as:
-![IDF Formula](formula2.svg)
-where `N` is the total number of documents in the collection, and `n(q)` is the number of documents containing `qi`.
-There are several interpretations for IDF and slight variations on its formula. In the original BM25 derivation,
-the IDF component is derived from the Binary Independence Model.
-The above formula for IDF has drawbacks for terms appearing in more than half of the corpus documents. These terms' IDF is negative,
-so for any two almost-identical documents, one which contains the term may be ranked lower than one which does not. This is often an
-undesirable behavior, so many applications adjust the IDF formula in various ways:
-Each summand can be given a floor of 0, to trim out common terms;
-The IDF function can be given a floor of a constant `e`, to avoid common terms being ignored at all;
-The IDF function can be replaced with a similarly shaped one which is non-negative, or strictly positive to avoid terms being ignored at all.
 ## Usage
 First we create an inverted Index
@@ -248,6 +242,92 @@ First we create an inverted Index
 We can see the results are sorted by score. Notice that the documents we index can have any other
 fields, like name, price, etc., but only `:id` and `:indexed_field` are required.
+## Language support (stop words, stemmers)
+Creating an index using `MiniSearch.new_index` gives you an inverted index that does not
+have any language support, such as stop words and synonyms. We can pass them as arguments
+to `new_index`, like:
+```
+index = MiniSearch.new_index(
+  stop_words: stop_words,
+  stemmer: stemmer,
+  synonyms_map: synonyms_map
+)
+```
+Arguments:
+  - The stop_words is an array of words that should be removed when indexing the document.
+  - The stemmer is an object of type Stemmer that implements a `stem` method, which removes all but the stem of the word (example: `carrocha` -> `carr`).
+  - The synonyms_map is a hashmap with original terms and a list of synonyms (example: `{'calçado' => ['sapato', 'tenis', 'salto', 'chinelo']}`)
+## n-gram Tokenizer
+By default, creating an index using `MiniSearch.new_index` gives you an inverted index that uses a simple whitespace tokenizer.
+(e.g. `"Hello World" => ["Hello", "World"]`)
+You can change this behavior to use an n-gram tokenizer which will break words down into smaller pieces with a configurable
+token window. You can read more about how this kind of tokenization works for [Elastic Search](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html).
+(e.g. `ngrams: 2` the phrase `"Hello World" => ["He", "el", "ll", "lo", "o ", " W", "Wo", "or", "rl", "ld"]`)
+or
+(e.g. `ngrams: 3` the phrase `"Hello World" => ["Hel", "ell", "llo", "lo ", "o W", " Wo", "Wor", "orl", "rld"]`)
+To enable this simply pass an integer for the parameter `ngrams`.
+```
+index = MiniSearch.new_index(
+  ngrams: 2,
+)
+```
+Arguments:
+- ngrams: An integer which represents the number of characters each token should have. Common values are `2` for bigrams or `3` for trigrams.
+# Stemmers
+Stemmers are classes that implement the `def stem(word)` method, which receives a word and returns its stem:
+Example of a NaiveEnglishStemmer:
+```
+module MiniSearch
+  module Stemmer
+    class NaiveEnglishStemmer
+      def stem(word)
+        # removes plural
+        word[0..-2] if word.end_with?('s')
+      end
+    end
+  end
+end
+```
+MiniSearch comes with a Brazilian Portuguese stemmer for now.
+## Configuring multiple cores using yaml
+You can configure multiple cores using a YAML config file.
+```yaml
+cores:
+  - main:
+    lang: 'pt'
+    synonyms_map:
+      bebe: 'nene'
+    stop_words:
+      - 'de'
+      - 'para'
+  - aux:
+    lang: 'pt'
+    synonyms_map:
+      bebe: 'nene'
+    stop_words:
+      - 'de'
+      - 'para'
+```
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

data/bin/console CHANGED Viewed

@@ -2,6 +2,7 @@
 require "bundler/setup"
 require "mini_search"
+require "ruby_ngrams" # We want to be able to use this when testing!
 # You can add fixtures and/or initialization code here to make experimenting
 # with your gem easier. You can also use a different console, if you like.

data/lib/mini_search/compound_tokenizer.rb ADDED Viewed

@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+module MiniSearch
+  class CompoundTokenizer
+    def initialize(tokenizers)
+      @tokenizers = tokenizers
+    end
+    def execute(string)
+      @tokenizers.each_with_object([]) do |tokenizer, tokens|
+        tokens.concat(tokenizer.execute(string))
+      end
+    end
+  end
+end

data/lib/mini_search/inverted_index.rb CHANGED Viewed

@@ -154,8 +154,8 @@ module MiniSearch
     def generate_idfs(processed_terms)
       processed_terms.each_with_object({}) do |term, idfs|
-        if @index[term].to_a.any?
-          idfs[term] = Idf.calculate(@index[term].size, @documents.size)
+        if @inverted_index[term].to_a.any?
+          idfs[term] = Idf.calculate(@inverted_index[term].size, @documents.size)
         end
       end
     end

data/lib/mini_search/ngram_tokenizer.rb ADDED Viewed

@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+require "ruby_ngrams"
+module MiniSearch
+  class NgramTokenizer
+    def initialize(n)
+      @n = n || 2
+    end
+    def execute(string)
+      # In the future, we may want to consider doing a strip on tokens to remove whitespace.
+      string.ngrams(regex: //, n: @n).map(&:join)
+    end
+  end
+end

data/lib/mini_search/pipeline.rb CHANGED Viewed

@@ -5,16 +5,27 @@ module MiniSearch
   # do when indexing a document or searching
   class Pipeline
     def initialize(tokenizer, filters)
+      @standard_tokenizer = MiniSearch::StandardWhitespaceTokenizer.new
       @tokenizer = tokenizer
       @filters = filters
     end
     def execute(string)
-      tokens = @tokenizer.execute(string)
+      # Since the filter model expects tokens that are tokenized by
+      # the standard tokenizer, let's use that first.
+      tokens = @standard_tokenizer.execute(string)
-      @filters.reduce(tokens) do |filtered_tokens, filter|
+      # Apply filters
+      filters_applied = @filters.reduce(tokens) do |filtered_tokens, filter|
         filter.execute(filtered_tokens)
       end
+      # Return if our selected tokenizer is the standard tokenizer
+      return filters_applied if @tokenizer.is_a? MiniSearch::StandardWhitespaceTokenizer
+      # Execute non-standard tokenization after rejoining the tokens
+      # that were tokenized with the StandardWhitespaceTokenizer
+      @tokenizer.execute(filters_applied.join(' '))
     end
   end
 end

data/lib/mini_search/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module MiniSearch
-  VERSION = "1.0.3"
+  VERSION = "1.2.0"
 end

data/lib/mini_search.rb CHANGED Viewed

@@ -1,6 +1,9 @@
+require 'yaml'
 require 'mini_search/version.rb'
 require 'mini_search/stemmer/portuguese.rb'
 require 'mini_search/standard_whitespace_tokenizer.rb'
+require 'mini_search/ngram_tokenizer.rb'
+require 'mini_search/compound_tokenizer.rb'
 require 'mini_search/strip_filter.rb'
 require 'mini_search/remove_punctuation_filter.rb'
 require 'mini_search/downcase_filter.rb'
@@ -23,8 +26,13 @@ module MiniSearch
     MiniSearch::InvertedIndex.new(indexing_pipeline, querying_pipeline)
   end
-  def self.new_index(stop_words: [], synonyms_map: {}, stemmer: nil)
-    standard_whitespace_tokenizer = StandardWhitespaceTokenizer.new
+  def self.new_index(stop_words: [], synonyms_map: {}, stemmer: nil, ngrams: nil)
+    tokenizer =
+      if ngrams
+        NgramTokenizer.new(ngrams)
+      else
+        StandardWhitespaceTokenizer.new
+      end
     strip_filter = StripFilter.new
     remove_punctuation_filter = RemovePunctuationFilter.new
@@ -34,7 +42,7 @@ module MiniSearch
     synonyms_filter = SynonymsFilter.new(synonyms_map)
     indexing_pipeline = Pipeline.new(
-      standard_whitespace_tokenizer,
+      tokenizer,
       [
         strip_filter,
         remove_punctuation_filter,
@@ -45,7 +53,7 @@ module MiniSearch
     )
     querying_pipeline = Pipeline.new(
-      standard_whitespace_tokenizer,
+      tokenizer,
       [
         strip_filter,
         remove_punctuation_filter,
@@ -59,17 +67,40 @@ module MiniSearch
     new(indexing_pipeline, querying_pipeline)
   end
-  def self.new_localized_index(language_support, synonyms_map: {}, stop_words: [])
-    if language_support.is_a?(Symbol)
-      language_support = LANGUAGE_SUPPORTS[language_support].new(stop_words)
-    end
-    raise 'language support not found or nil' unless language_support
+  def self.new_localized_index(lang, synonyms_map: {}, stop_words: [], ngrams: nil)
+    language_support = find_language_support(lang, stop_words)
     new_index(
       stop_words: language_support.stop_words,
       stemmer: language_support.stemmer,
-      synonyms_map: synonyms_map
+      synonyms_map: synonyms_map,
+      ngrams: ngrams
     )
   end
+  def self.from_config_file(file)
+    raise "file not found '#{file}'" unless File.exists?(file)
+    cores = YAML.load_file(file)['cores']
+    cores.map do |core|
+      lang = core['lang'].to_sym
+      new_localized_index(
+        lang,
+        stop_words: core['stop_words'],
+        synonyms_map: core['synonyms_map'].transform_values { |v| v.split(',') }
+      )
+    end
+  end
+  private_class_method def self.find_language_support(lang, stop_words)
+    if lang.is_a?(Symbol)
+      language_support = LANGUAGE_SUPPORTS[lang].new(stop_words)
+    end
+    raise 'language support not found or nil' unless language_support
+    language_support
+  end
 end

data/mini_search.gemspec CHANGED Viewed

@@ -23,7 +23,8 @@ Gem::Specification.new do |spec|
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
-  spec.add_development_dependency "bundler", "~> 1.16"
-  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "bundler", "~> 2.4.10"
+  spec.add_development_dependency "rake", "~> 12.0"
   spec.add_development_dependency "rspec", "~> 3.0"
+  spec.add_runtime_dependency "ruby_ngrams", "~> 0.0.6"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: mini_search
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: 1.2.0
 platform: ruby
 authors:
 - Andrew S Aguiar
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-09-28 00:00:00.000000000 Z
+date: 2023-05-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -16,28 +16,28 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.16'
+        version: 2.4.10
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.16'
+        version: 2.4.10
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: '12.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: '12.0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -52,6 +52,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: ruby_ngrams
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.0.6
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.0.6
 description: In-memory naive search engine.
 email:
 - andrewaguiar6@gmail.com
@@ -59,6 +73,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".devcontainer/devcontainer.json"
 - ".gitignore"
 - ".rspec"
 - ".travis.yml"
@@ -74,11 +89,13 @@ files:
 - formula2.svg
 - lib/mini_search.rb
 - lib/mini_search/bm_25.rb
+- lib/mini_search/compound_tokenizer.rb
 - lib/mini_search/downcase_filter.rb
 - lib/mini_search/idf.rb
 - lib/mini_search/inverted_index.rb
 - lib/mini_search/inverted_index_spec.rb
 - lib/mini_search/language_support/portuguese.rb
+- lib/mini_search/ngram_tokenizer.rb
 - lib/mini_search/pipeline.rb
 - lib/mini_search/remove_punctuation_filter.rb
 - lib/mini_search/standard_whitespace_tokenizer.rb
@@ -94,7 +111,7 @@ homepage: https://www.github.com/andrewaguiar/mini_search
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -109,8 +126,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.3
-signing_key:
+rubygems_version: 3.1.6
+signing_key:
 specification_version: 4
 summary: In-memory naive search engine.
 test_files: []