known_item_search_classifier 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 2 
     | 
    
         
            +
            SHA256:
         
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: a95567708e0b56c79c3a102e1d7c72e493e5660518de3b24c8fc42a691609938
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 70ea59d9d7c0451b3d454506e578c2761c12e7d226edca852431c76bee1a9456
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 4fb37b0932e9e0c32f9ec0ef6bdc563bd7e4e4cca5f401186daec4ae8d3be112b96478a9f04cf715620144e5db30e340959db808d5cc99841360dd72d480984d
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 96777f8fa22a9208dc4e22a76a4c74dd57785c32a52de386d0e78678880a0d0faa020e0390fe8d1275c2cc3326cdf6cfa2fae6dcee4bbe299c66c79918a696fd
         
     | 
| 
         @@ -3,52 +3,54 @@ require 'csv' 
     | 
|
| 
       3 
3 
     | 
    
         
             
            require 'gaussian_naive_bayes'
         
     | 
| 
       4 
4 
     | 
    
         | 
| 
       5 
5 
     | 
    
         
             
            module KnownItemSearchClassifier
         
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
             
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
       23 
     | 
    
         
            -
             
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
       25 
     | 
    
         
            -
             
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
       29 
     | 
    
         
            -
             
     | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
      
 6 
     | 
    
         
            +
              class Classifier
         
     | 
| 
      
 7 
     | 
    
         
            +
                def initialize
         
     | 
| 
      
 8 
     | 
    
         
            +
                  set = DefaultTrainingSet.new
         
     | 
| 
      
 9 
     | 
    
         
            +
                  @default_training_set = GaussianNaiveBayes::Classifier.new set.categories_summaries,
         
     | 
| 
      
 10 
     | 
    
         
            +
                                                                             set.categories_probabilities
         
     | 
| 
      
 11 
     | 
    
         
            +
                end
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
                def is_known_item_search?(query_string)
         
     | 
| 
      
 14 
     | 
    
         
            +
                  classify query_string
         
     | 
| 
      
 15 
     | 
    
         
            +
                end
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
                def train(training_set)
         
     | 
| 
      
 18 
     | 
    
         
            +
                  @custom_training_set = GaussianNaiveBayes::Learner.new unless defined? @custom_training_set
         
     | 
| 
      
 19 
     | 
    
         
            +
                  training_set.each do |query|
         
     | 
| 
      
 20 
     | 
    
         
            +
                    submit_vector query
         
     | 
| 
      
 21 
     | 
    
         
            +
                  end
         
     | 
| 
      
 22 
     | 
    
         
            +
                end
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
                def train_from_csv(filename)
         
     | 
| 
      
 25 
     | 
    
         
            +
                  @custom_training_set = GaussianNaiveBayes::Learner.new unless defined? @custom_training_set
         
     | 
| 
      
 26 
     | 
    
         
            +
                  csv = ::CSV.read(filename)
         
     | 
| 
      
 27 
     | 
    
         
            +
                  csv.each do |line|
         
     | 
| 
      
 28 
     | 
    
         
            +
                    submit_vector line
         
     | 
| 
      
 29 
     | 
    
         
            +
                  end
         
     | 
| 
      
 30 
     | 
    
         
            +
                end
         
     | 
| 
       31 
31 
     | 
    
         | 
| 
       32 
32 
     | 
    
         
             
                private
         
     | 
| 
       33 
     | 
    
         
            -
             
     | 
| 
       34 
     | 
    
         
            -
             
     | 
| 
       35 
     | 
    
         
            -
             
     | 
| 
       36 
     | 
    
         
            -
             
     | 
| 
       37 
     | 
    
         
            -
             
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
       39 
     | 
    
         
            -
             
     | 
| 
       40 
     | 
    
         
            -
             
     | 
| 
       41 
     | 
    
         
            -
             
     | 
| 
       42 
     | 
    
         
            -
             
     | 
| 
       43 
     | 
    
         
            -
             
     | 
| 
       44 
     | 
    
         
            -
             
     | 
| 
       45 
     | 
    
         
            -
             
     | 
| 
       46 
     | 
    
         
            -
             
     | 
| 
       47 
     | 
    
         
            -
             
     | 
| 
       48 
     | 
    
         
            -
             
     | 
| 
       49 
     | 
    
         
            -
             
     | 
| 
       50 
     | 
    
         
            -
             
     | 
| 
       51 
     | 
    
         
            -
             
     | 
| 
       52 
     | 
    
         
            -
             
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
                attr_reader :custom_tr
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
                def classify(string)
         
     | 
| 
      
 37 
     | 
    
         
            +
                  f = FeatureExtractor.new string
         
     | 
| 
      
 38 
     | 
    
         
            +
                  feature_array = f.feature_array
         
     | 
| 
      
 39 
     | 
    
         
            +
                  if defined? @custom_training_set
         
     | 
| 
      
 40 
     | 
    
         
            +
                    classifier = @custom_training_set.classifier
         
     | 
| 
      
 41 
     | 
    
         
            +
                    query_class = classifier.classify(feature_array)
         
     | 
| 
      
 42 
     | 
    
         
            +
                  else
         
     | 
| 
      
 43 
     | 
    
         
            +
                    query_class = @default_training_set.classify(feature_array)
         
     | 
| 
      
 44 
     | 
    
         
            +
                  end
         
     | 
| 
      
 45 
     | 
    
         
            +
                  return query_class
         
     | 
| 
      
 46 
     | 
    
         
            +
                  return true if :known == query_class
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
                  false
         
     | 
| 
      
 49 
     | 
    
         
            +
                end
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
                def submit_vector(arr)
         
     | 
| 
      
 52 
     | 
    
         
            +
                  f = FeatureExtractor.new arr[0]
         
     | 
| 
      
 53 
     | 
    
         
            +
                  @custom_training_set.train f.feature_array, arr[1]
         
     | 
| 
       53 
54 
     | 
    
         
             
                end
         
     | 
| 
      
 55 
     | 
    
         
            +
              end
         
     | 
| 
       54 
56 
     | 
    
         
             
            end
         
     | 
| 
         @@ -1,23 +1,24 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module KnownItemSearchClassifier
         
     | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
             
     | 
| 
       4 
     | 
    
         
            -
             
     | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
             
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
      
 2 
     | 
    
         
            +
              class DefaultTrainingSet
         
     | 
| 
      
 3 
     | 
    
         
            +
                attr_reader :categories_probabilities, :categories_summaries
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
                def initialize
         
     | 
| 
      
 6 
     | 
    
         
            +
                  @categories_probabilities = { 'known' => 0.3333333333333333, 'unknown' => 0.6666666666666666 }
         
     | 
| 
      
 7 
     | 
    
         
            +
                  @categories_summaries =
         
     | 
| 
      
 8 
     | 
    
         
            +
                    { 'known' =>
         
     | 
| 
      
 9 
     | 
    
         
            +
                      { 0 => { mean: 0.6, standard_deviation: 0.5 },
         
     | 
| 
      
 10 
     | 
    
         
            +
                        1 => { mean: 0.0516060606060606, standard_deviation: 0.09910312916958242 },
         
     | 
| 
      
 11 
     | 
    
         
            +
                        2 => { mean: 0.06633333333333333, standard_deviation: 0.13412266359153804 },
         
     | 
| 
      
 12 
     | 
    
         
            +
                        3 => { mean: 0.2575454545454545, standard_deviation: 0.27976953051588926 },
         
     | 
| 
      
 13 
     | 
    
         
            +
                        4 => { mean: 4.76, standard_deviation: 3.8867295592395754 },
         
     | 
| 
      
 14 
     | 
    
         
            +
                        5 => { mean: 3.48, standard_deviation: 4.91697739131132 } },
         
     | 
| 
      
 15 
     | 
    
         
            +
                      'unknown' =>
         
     | 
| 
      
 16 
     | 
    
         
            +
                      { 0 => { mean: 0.18, standard_deviation: 0.38808793449160356 },
         
     | 
| 
      
 17 
     | 
    
         
            +
                        1 => { mean: 0.03966666666666667, standard_deviation: 0.1241245990920947 },
         
     | 
| 
      
 18 
     | 
    
         
            +
                        2 => { mean: 0.009000000000000001, standard_deviation: 0.04482391854210637 },
         
     | 
| 
      
 19 
     | 
    
         
            +
                        3 => { mean: 0.11, standard_deviation: 0.25134558515041244 },
         
     | 
| 
      
 20 
     | 
    
         
            +
                        4 => { mean: 2.44, standard_deviation: 1.0720950308167836 },
         
     | 
| 
      
 21 
     | 
    
         
            +
                        5 => { mean: 0.14, standard_deviation: 0.7001457574195914 } } }
         
     | 
| 
       22 
22 
     | 
    
         
             
                end
         
     | 
| 
      
 23 
     | 
    
         
            +
              end
         
     | 
| 
       23 
24 
     | 
    
         
             
            end
         
     | 
| 
         @@ -1,73 +1,75 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            require 'engtagger'
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            module KnownItemSearchClassifier
         
     | 
| 
       4 
     | 
    
         
            -
             
     | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
             
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
                        
         
     | 
| 
       20 
     | 
    
         
            -
                    end
         
     | 
| 
       21 
     | 
    
         
            -
                    def feature_array
         
     | 
| 
       22 
     | 
    
         
            -
                        return [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
         
     | 
| 
       23 
     | 
    
         
            -
                    end
         
     | 
| 
       24 
     | 
    
         
            -
                    private
         
     | 
| 
       25 
     | 
    
         
            -
                    def is_mixed_case?
         
     | 
| 
       26 
     | 
    
         
            -
                        if @string =~ /[A-Z]/ and @string =~ /[a-z]/
         
     | 
| 
       27 
     | 
    
         
            -
                            return 1.0
         
     | 
| 
       28 
     | 
    
         
            -
                        end 
         
     | 
| 
       29 
     | 
    
         
            -
                        return 0.0
         
     | 
| 
       30 
     | 
    
         
            -
                    end
         
     | 
| 
       31 
     | 
    
         
            -
                    def punctuation_ratio
         
     | 
| 
       32 
     | 
    
         
            -
                        num_punct = @tagged.scan(/\/PP/).size.to_f
         
     | 
| 
       33 
     | 
    
         
            -
                        return num_punct / @num_words
         
     | 
| 
       34 
     | 
    
         
            -
                    end
         
     | 
| 
       35 
     | 
    
         
            -
                    def determiner_ratio
         
     | 
| 
       36 
     | 
    
         
            -
                        num_det = @tagged.scan(/\/DET/).size.to_f
         
     | 
| 
       37 
     | 
    
         
            -
                        return num_det / @num_words
         
     | 
| 
       38 
     | 
    
         
            -
                    end
         
     | 
| 
       39 
     | 
    
         
            -
                    def numeric_count
         
     | 
| 
       40 
     | 
    
         
            -
                        return @string.scan(/[0-9]/).length
         
     | 
| 
       41 
     | 
    
         
            -
                    end
         
     | 
| 
       42 
     | 
    
         
            -
                    def proper_noun_ratio
         
     | 
| 
       43 
     | 
    
         
            -
                        num_prop_noun = @tagged.scan(/\/NNP/).size.to_f
         
     | 
| 
       44 
     | 
    
         
            -
                        return num_prop_noun / @num_words
         
     | 
| 
       45 
     | 
    
         
            -
                    end
         
     | 
| 
       46 
     | 
    
         
            -
                    def count_keywords
         
     | 
| 
       47 
     | 
    
         
            -
                    end
         
     | 
| 
       48 
     | 
    
         
            -
                    def check_against_known_titles
         
     | 
| 
       49 
     | 
    
         
            -
                    end
         
     | 
| 
       50 
     | 
    
         
            -
                    def count_keywords
         
     | 
| 
       51 
     | 
    
         
            -
                        keywords_to_match = ['journal', 'course', 'textbook']
         
     | 
| 
       52 
     | 
    
         
            -
                        num_keywords = 0
         
     | 
| 
       53 
     | 
    
         
            -
                        @query_string.split.each do |word|
         
     | 
| 
       54 
     | 
    
         
            -
                            if keywords_to_match.include? word.gsub(/[[:punct:]]/, '').downcase
         
     | 
| 
       55 
     | 
    
         
            -
                                num_keywords = num_keywords + 1
         
     | 
| 
       56 
     | 
    
         
            -
                            end
         
     | 
| 
       57 
     | 
    
         
            -
                        end
         
     | 
| 
       58 
     | 
    
         
            -
                        return num_keywords
         
     | 
| 
       59 
     | 
    
         
            -
                    end
         
     | 
| 
       60 
     | 
    
         
            -
                    def check_against_known_titles
         
     | 
| 
       61 
     | 
    
         
            -
                       known_titles = [
         
     | 
| 
       62 
     | 
    
         
            -
                           'fountainhead',
         
     | 
| 
       63 
     | 
    
         
            -
                           'salt sugar fat',
         
     | 
| 
       64 
     | 
    
         
            -
                       ]
         
     | 
| 
       65 
     | 
    
         
            -
                       if known_titles.include? @query_string.downcase
         
     | 
| 
       66 
     | 
    
         
            -
                           return true
         
     | 
| 
       67 
     | 
    
         
            -
                       else
         
     | 
| 
       68 
     | 
    
         
            -
                           return false
         
     | 
| 
       69 
     | 
    
         
            -
                       end
         
     | 
| 
       70 
     | 
    
         
            -
                    end
         
     | 
| 
      
 4 
     | 
    
         
            +
              class FeatureExtractor
         
     | 
| 
      
 5 
     | 
    
         
            +
                def initialize(string)
         
     | 
| 
      
 6 
     | 
    
         
            +
                  @string = string
         
     | 
| 
      
 7 
     | 
    
         
            +
                  tagger = EngTagger.new
         
     | 
| 
      
 8 
     | 
    
         
            +
                  @tagged = tagger.get_readable string
         
     | 
| 
      
 9 
     | 
    
         
            +
                  @num_words = @tagged.scan(%r{/[A-Z]{2}}).size.to_f
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
                  @mixed_case = is_mixed_case?
         
     | 
| 
      
 12 
     | 
    
         
            +
                  @punctuation_ratio = punctuation_ratio
         
     | 
| 
      
 13 
     | 
    
         
            +
                  @determiner_ratio = determiner_ratio
         
     | 
| 
      
 14 
     | 
    
         
            +
                  @proper_noun_ratio = proper_noun_ratio
         
     | 
| 
      
 15 
     | 
    
         
            +
                  @numeric_count = numeric_count
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
                  # @num_keywords = count_keywords
         
     | 
| 
      
 18 
     | 
    
         
            +
                  # @refers_to_an_item_that_is_known = check_against_known_titles
         
     | 
| 
       71 
19 
     | 
    
         
             
                end
         
     | 
| 
       72 
     | 
    
         
            -
            end
         
     | 
| 
       73 
20 
     | 
    
         | 
| 
      
 21 
     | 
    
         
            +
                def feature_array
         
     | 
| 
      
 22 
     | 
    
         
            +
                  [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words, @numeric_count]
         
     | 
| 
      
 23 
     | 
    
         
            +
                end
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
                private
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
                def is_mixed_case?
         
     | 
| 
      
 28 
     | 
    
         
            +
                  return 1.0 if @string =~ /[A-Z]/ and @string =~ /[a-z]/
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
                  0.0
         
     | 
| 
      
 31 
     | 
    
         
            +
                end
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
      
 33 
     | 
    
         
            +
                def punctuation_ratio
         
     | 
| 
      
 34 
     | 
    
         
            +
                  num_punct = @tagged.scan(%r{/PP}).size.to_f
         
     | 
| 
      
 35 
     | 
    
         
            +
                  num_punct / @num_words
         
     | 
| 
      
 36 
     | 
    
         
            +
                end
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
                def determiner_ratio
         
     | 
| 
      
 39 
     | 
    
         
            +
                  num_det = @tagged.scan(%r{/DET}).size.to_f
         
     | 
| 
      
 40 
     | 
    
         
            +
                  num_det / @num_words
         
     | 
| 
      
 41 
     | 
    
         
            +
                end
         
     | 
| 
      
 42 
     | 
    
         
            +
             
     | 
| 
      
 43 
     | 
    
         
            +
                def numeric_count
         
     | 
| 
      
 44 
     | 
    
         
            +
                  @string.scan(/[0-9]/).length
         
     | 
| 
      
 45 
     | 
    
         
            +
                end
         
     | 
| 
      
 46 
     | 
    
         
            +
             
     | 
| 
      
 47 
     | 
    
         
            +
                def proper_noun_ratio
         
     | 
| 
      
 48 
     | 
    
         
            +
                  num_prop_noun = @tagged.scan(%r{/NNP}).size.to_f
         
     | 
| 
      
 49 
     | 
    
         
            +
                  num_prop_noun / @num_words
         
     | 
| 
      
 50 
     | 
    
         
            +
                end
         
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
      
 52 
     | 
    
         
            +
                def count_keywords; end
         
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
      
 54 
     | 
    
         
            +
                def check_against_known_titles; end
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
      
 56 
     | 
    
         
            +
                def count_keywords
         
     | 
| 
      
 57 
     | 
    
         
            +
                  keywords_to_match = %w[journal course textbook]
         
     | 
| 
      
 58 
     | 
    
         
            +
                  num_keywords = 0
         
     | 
| 
      
 59 
     | 
    
         
            +
                  @query_string.split.each do |word|
         
     | 
| 
      
 60 
     | 
    
         
            +
                    num_keywords += 1 if keywords_to_match.include? word.gsub(/[[:punct:]]/, '').downcase
         
     | 
| 
      
 61 
     | 
    
         
            +
                  end
         
     | 
| 
      
 62 
     | 
    
         
            +
                  num_keywords
         
     | 
| 
      
 63 
     | 
    
         
            +
                end
         
     | 
| 
      
 64 
     | 
    
         
            +
             
     | 
| 
      
 65 
     | 
    
         
            +
                def check_against_known_titles
         
     | 
| 
      
 66 
     | 
    
         
            +
                  known_titles = [
         
     | 
| 
      
 67 
     | 
    
         
            +
                    'fountainhead',
         
     | 
| 
      
 68 
     | 
    
         
            +
                    'salt sugar fat'
         
     | 
| 
      
 69 
     | 
    
         
            +
                  ]
         
     | 
| 
      
 70 
     | 
    
         
            +
                  return true if known_titles.include? @query_string.downcase
         
     | 
| 
      
 71 
     | 
    
         
            +
             
     | 
| 
      
 72 
     | 
    
         
            +
                  false
         
     | 
| 
      
 73 
     | 
    
         
            +
                end
         
     | 
| 
      
 74 
     | 
    
         
            +
              end
         
     | 
| 
      
 75 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -1,62 +1,49 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            require 'coveralls'
         
     | 
| 
       2 
     | 
    
         
            -
            Coveralls.wear!
         
     | 
| 
       3 
1 
     | 
    
         
             
            require 'minitest/autorun'
         
     | 
| 
       4 
2 
     | 
    
         
             
            require './lib/known_item_search_classifier'
         
     | 
| 
       5 
3 
     | 
    
         | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
4 
     | 
    
         
             
            class KnownItemSearchClassifierTest < Minitest::Test
         
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
      
 5 
     | 
    
         
            +
              classifier = KnownItemSearchClassifier::Classifier.new
         
     | 
| 
       9 
6 
     | 
    
         | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
       13 
     | 
    
         
            -
             
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
             
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
       23 
     | 
    
         
            -
                 
     | 
| 
       24 
     | 
    
         
            -
                 
     | 
| 
       25 
     | 
    
         
            -
             
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
      
 7 
     | 
    
         
            +
              known_item_training_set = [
         
     | 
| 
      
 8 
     | 
    
         
            +
                # 'hobbit first edition',  -- classifier incorrectly classifies this as unknown
         
     | 
| 
      
 9 
     | 
    
         
            +
                # 'my soul is rested',  -- classifier incorrectly classifies this as unknown
         
     | 
| 
      
 10 
     | 
    
         
            +
                # 'new yorker',  -- classifier incorrectly classifies this as unknown
         
     | 
| 
      
 11 
     | 
    
         
            +
                # 'when harry met sally', -- classifier incorrectly classifies this as unknown
         
     | 
| 
      
 12 
     | 
    
         
            +
                # '"neo tekunoroji"',  -- classifier incorrectly classifies this as unknown
         
     | 
| 
      
 13 
     | 
    
         
            +
                '99131236427206421',
         
     | 
| 
      
 14 
     | 
    
         
            +
                'A decision making model for selecting start-up businesses in a government venture capital scheme',
         
     | 
| 
      
 15 
     | 
    
         
            +
                # 'Dostoevsky Brothers Karamazov', -- classifier incorrectly classifies this as unknown
         
     | 
| 
      
 16 
     | 
    
         
            +
                # 'Lawrence Classic American Literature', -- classifier incorrectly classifies this as unknown
         
     | 
| 
      
 17 
     | 
    
         
            +
                # 'salt sugar fat', -- classifier incorrectly classifies this as unknown
         
     | 
| 
      
 18 
     | 
    
         
            +
                'Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction.  Viking. 2015. Print',
         
     | 
| 
      
 19 
     | 
    
         
            +
                'the inconvenient truth',
         
     | 
| 
      
 20 
     | 
    
         
            +
                'Polarization: What Everyone Needs to Know',
         
     | 
| 
      
 21 
     | 
    
         
            +
                'little house on the'
         
     | 
| 
      
 22 
     | 
    
         
            +
              ]
         
     | 
| 
      
 23 
     | 
    
         
            +
              known_item_training_set.each do |query|
         
     | 
| 
      
 24 
     | 
    
         
            +
                cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
         
     | 
| 
      
 25 
     | 
    
         
            +
                define_method("test_#{cleaned_up_query}_is_classified_as_known_item") do
         
     | 
| 
      
 26 
     | 
    
         
            +
                  assert_equal(:known, classifier.is_known_item_search?(query).to_sym)
         
     | 
| 
       29 
27 
     | 
    
         
             
                end
         
     | 
| 
      
 28 
     | 
    
         
            +
              end
         
     | 
| 
       30 
29 
     | 
    
         | 
| 
       31 
     | 
    
         
            -
             
     | 
| 
       32 
     | 
    
         
            -
             
     | 
| 
       33 
     | 
    
         
            -
             
     | 
| 
       34 
     | 
    
         
            -
             
     | 
| 
       35 
     | 
    
         
            -
             
     | 
| 
       36 
     | 
    
         
            -
             
     | 
| 
       37 
     | 
    
         
            -
             
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
       39 
     | 
    
         
            -
             
     | 
| 
       40 
     | 
    
         
            -
             
     | 
| 
       41 
     | 
    
         
            -
             
     | 
| 
       42 
     | 
    
         
            -
             
     | 
| 
       43 
     | 
    
         
            -
             
     | 
| 
       44 
     | 
    
         
            -
             
     | 
| 
       45 
     | 
    
         
            -
             
     | 
| 
       46 
     | 
    
         
            -
             
     | 
| 
       47 
     | 
    
         
            -
             
     | 
| 
       48 
     | 
    
         
            -
                    "Professional baking ",
         
     | 
| 
       49 
     | 
    
         
            -
                    "concussions after the nfl",
         
     | 
| 
       50 
     | 
    
         
            -
                    "IVF the US",
         
     | 
| 
       51 
     | 
    
         
            -
                    "adoption children the US",
         
     | 
| 
       52 
     | 
    
         
            -
                    "Films for the hearing impaired",
         
     | 
| 
       53 
     | 
    
         
            -
                    "wolves and the ecosystem",
         
     | 
| 
       54 
     | 
    
         
            -
                    "dr. martin luther king",
         
     | 
| 
       55 
     | 
    
         
            -
                ]
         
     | 
| 
       56 
     | 
    
         
            -
                unknown_item_training_set.each do |query|
         
     | 
| 
       57 
     | 
    
         
            -
                    cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
         
     | 
| 
       58 
     | 
    
         
            -
                    define_method("test_#{cleaned_up_query}_is_not_false_positive") do
         
     | 
| 
       59 
     | 
    
         
            -
                        assert_equal(:unknown, classifier.is_known_item_search?(query))
         
     | 
| 
       60 
     | 
    
         
            -
                    end
         
     | 
| 
      
 30 
     | 
    
         
            +
              unknown_item_training_set = [
         
     | 
| 
      
 31 
     | 
    
         
            +
                'colonial mexico textiles',
         
     | 
| 
      
 32 
     | 
    
         
            +
                'history of horses',
         
     | 
| 
      
 33 
     | 
    
         
            +
                'medical expertise COVID',
         
     | 
| 
      
 34 
     | 
    
         
            +
                'music and sexuality',
         
     | 
| 
      
 35 
     | 
    
         
            +
                'paper industry',
         
     | 
| 
      
 36 
     | 
    
         
            +
                'sun ra',
         
     | 
| 
      
 37 
     | 
    
         
            +
                # 'concussions after the nfl', -- classifier incorrectly classifies this as known
         
     | 
| 
      
 38 
     | 
    
         
            +
                'Professional baking ',
         
     | 
| 
      
 39 
     | 
    
         
            +
                'Manos chatzidakis',
         
     | 
| 
      
 40 
     | 
    
         
            +
                'whey protein',
         
     | 
| 
      
 41 
     | 
    
         
            +
                'benefits of eating healthyhy'
         
     | 
| 
      
 42 
     | 
    
         
            +
              ]
         
     | 
| 
      
 43 
     | 
    
         
            +
              unknown_item_training_set.each do |query|
         
     | 
| 
      
 44 
     | 
    
         
            +
                cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
         
     | 
| 
      
 45 
     | 
    
         
            +
                define_method("test_#{cleaned_up_query}_is_not_false_positive") do
         
     | 
| 
      
 46 
     | 
    
         
            +
                  assert_equal(:unknown, classifier.is_known_item_search?(query).to_sym)
         
     | 
| 
       61 
47 
     | 
    
         
             
                end
         
     | 
| 
      
 48 
     | 
    
         
            +
              end
         
     | 
| 
       62 
49 
     | 
    
         
             
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: known_item_search_classifier
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.3.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Jane Sandberg
         
     | 
| 
       8 
     | 
    
         
            -
            autorequire: 
     | 
| 
      
 8 
     | 
    
         
            +
            autorequire:
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date:  
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2024-11-06 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: engtagger
         
     | 
| 
         @@ -53,21 +53,35 @@ dependencies: 
     | 
|
| 
       53 
53 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       54 
54 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       55 
55 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       56 
     | 
    
         
            -
              name:  
     | 
| 
      
 56 
     | 
    
         
            +
              name: rake
         
     | 
| 
       57 
57 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
       58 
58 
     | 
    
         
             
                requirements:
         
     | 
| 
       59 
     | 
    
         
            -
                - -  
     | 
| 
      
 59 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 60 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 61 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 62 
     | 
    
         
            +
              type: :development
         
     | 
| 
      
 63 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 64 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 65 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 66 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 67 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 68 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 69 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 70 
     | 
    
         
            +
              name: rubocop
         
     | 
| 
      
 71 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 72 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 73 
     | 
    
         
            +
                - - ">="
         
     | 
| 
       60 
74 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       61 
     | 
    
         
            -
                    version: 0 
     | 
| 
      
 75 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
       62 
76 
     | 
    
         
             
              type: :development
         
     | 
| 
       63 
77 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       64 
78 
     | 
    
         
             
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
       65 
79 
     | 
    
         
             
                requirements:
         
     | 
| 
       66 
     | 
    
         
            -
                - -  
     | 
| 
      
 80 
     | 
    
         
            +
                - - ">="
         
     | 
| 
       67 
81 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       68 
     | 
    
         
            -
                    version: 0 
     | 
| 
      
 82 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
       69 
83 
     | 
    
         
             
            description: Classify search query strings
         
     | 
| 
       70 
     | 
    
         
            -
            email: 
     | 
| 
      
 84 
     | 
    
         
            +
            email:
         
     | 
| 
       71 
85 
     | 
    
         
             
            executables: []
         
     | 
| 
       72 
86 
     | 
    
         
             
            extensions: []
         
     | 
| 
       73 
87 
     | 
    
         
             
            extra_rdoc_files: []
         
     | 
| 
         @@ -81,7 +95,7 @@ homepage: https://github.com/sandbergja/known_item_search_classifier 
     | 
|
| 
       81 
95 
     | 
    
         
             
            licenses:
         
     | 
| 
       82 
96 
     | 
    
         
             
            - MIT
         
     | 
| 
       83 
97 
     | 
    
         
             
            metadata: {}
         
     | 
| 
       84 
     | 
    
         
            -
            post_install_message: 
     | 
| 
      
 98 
     | 
    
         
            +
            post_install_message:
         
     | 
| 
       85 
99 
     | 
    
         
             
            rdoc_options: []
         
     | 
| 
       86 
100 
     | 
    
         
             
            require_paths:
         
     | 
| 
       87 
101 
     | 
    
         
             
            - lib
         
     | 
| 
         @@ -89,16 +103,15 @@ required_ruby_version: !ruby/object:Gem::Requirement 
     | 
|
| 
       89 
103 
     | 
    
         
             
              requirements:
         
     | 
| 
       90 
104 
     | 
    
         
             
              - - ">="
         
     | 
| 
       91 
105 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       92 
     | 
    
         
            -
                  version:  
     | 
| 
      
 106 
     | 
    
         
            +
                  version: 3.0.0
         
     | 
| 
       93 
107 
     | 
    
         
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         
     | 
| 
       94 
108 
     | 
    
         
             
              requirements:
         
     | 
| 
       95 
109 
     | 
    
         
             
              - - ">="
         
     | 
| 
       96 
110 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       97 
111 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       98 
112 
     | 
    
         
             
            requirements: []
         
     | 
| 
       99 
     | 
    
         
            -
             
     | 
| 
       100 
     | 
    
         
            -
             
     | 
| 
       101 
     | 
    
         
            -
            signing_key: 
         
     | 
| 
      
 113 
     | 
    
         
            +
            rubygems_version: 3.5.16
         
     | 
| 
      
 114 
     | 
    
         
            +
            signing_key:
         
     | 
| 
       102 
115 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       103 
116 
     | 
    
         
             
            summary: A ruby gem that classifies search query strings as either known-item searches
         
     | 
| 
       104 
117 
     | 
    
         
             
              or unknown-item searches
         
     |