known_item_search_classifier 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/known_item_search_classifier/classifier.rb +53 -0
- data/lib/known_item_search_classifier/default_training_set.rb +21 -0
- data/lib/known_item_search_classifier/feature_extractor.rb +68 -0
- data/lib/known_item_search_classifier.rb +3 -0
- data/test/known_item_search_classifier_test.rb +62 -0
- metadata +106 -0
    
        checksums.yaml
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            SHA1:
         | 
| 3 | 
            +
              metadata.gz: df03a8dc0661439a41c7d3366a49573f463a176f
         | 
| 4 | 
            +
              data.tar.gz: 58713a770bba4a173adc4e88bb507f256b4eafc5
         | 
| 5 | 
            +
            SHA512:
         | 
| 6 | 
            +
              metadata.gz: 470565b3b2932df41a0d02b99048746b5f5e97de476b273a9648d473d63d7e73e44d038ff2b35ffd1f0bbd413960d35df5ca1ad8e8b8902038aedb57d64225f8
         | 
| 7 | 
            +
              data.tar.gz: 7e81f8262925a653ab33c12844947f9df40b5510cf3e43d801d6a31cab7f3bf3a3de58dc9693ecbd71283ff309dc9bdb6490d1a6be25fe436726ffc09fea28e0
         | 
| @@ -0,0 +1,53 @@ | |
| 1 | 
            +
            # Classifies search strings as either known-item searches or unknown-item searches
         | 
| 2 | 
            +
            require 'gaussian_naive_bayes'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module KnownItemSearchClassifier
                # Labels search query strings with a Gaussian naive Bayes model fed by
                # the feature vectors that FeatureExtractor produces.
                #
                # A model built from DefaultTrainingSet is used until #train or
                # #train_from_csv has been called; from then on the custom learner's
                # classifier is used instead.
                class Classifier
                    # Builds the default model from the per-category summaries and
                    # prior probabilities exposed by DefaultTrainingSet.
                    def initialize
                        set = DefaultTrainingSet.new
                        @default_training_set = GaussianNaiveBayes::Classifier.new(
                            set.categories_summaries, set.categories_probabilities
                        )
                    end

                    # Classifies a query string.
                    #
                    # NOTE: despite the predicate-style name this returns the category
                    # label chosen by the model (:known / :unknown for the default
                    # model), not a boolean, so every result is truthy. Use
                    # #known_item_search? when a real boolean is needed.
                    #
                    # @param query_string [String] the search query to classify
                    # @return [Object] the category label picked by the active model
                    def is_known_item_search?(query_string)
                        classify(query_string)
                    end

                    # Boolean form of #is_known_item_search?.
                    #
                    # The label is compared via #to_s so that both the :known symbol of
                    # the default model and a "known" string label (e.g. one read from
                    # a CSV file) count as a match.
                    #
                    # @param query_string [String] the search query to classify
                    # @return [Boolean] true when the query is labelled as known
                    def known_item_search?(query_string)
                        classify(query_string).to_s == 'known'
                    end

                    # Adds examples to the custom model, creating it on first use.
                    #
                    # @param training_set [#each] pairs of [query_string, category]
                    # @return [Object] whatever training_set.each returns
                    def train(training_set)
                        ensure_custom_learner
                        training_set.each { |example| submit_vector(example) }
                    end

                    # Adds examples read from a CSV file to the custom model. The first
                    # column is used as the query string and the second as its category.
                    #
                    # @param filename [String] path of the CSV file
                    # @return [Array<Array>] the rows that were read
                    def train_from_csv(filename)
                        # Loaded here because this is the only code path that needs CSV.
                        require 'csv'
                        ensure_custom_learner
                        CSV.read(filename).each { |row| submit_vector(row) }
                    end

                    private

                    # Creates the custom learner once; later calls keep the existing
                    # learner so that examples accumulate across calls.
                    def ensure_custom_learner
                        @custom_training_set ||= GaussianNaiveBayes::Learner.new
                    end

                    # Extracts the features of +string+ and returns the label chosen by
                    # the custom model when one has been trained, or by the default
                    # model otherwise.
                    def classify(string)
                        feature_array = FeatureExtractor.new(string).feature_array
                        if @custom_training_set
                            @custom_training_set.classifier.classify(feature_array)
                        else
                            @default_training_set.classify(feature_array)
                        end
                    end

                    # Feeds one [query_string, category] example to the custom learner.
                    def submit_vector(arr)
                        features = FeatureExtractor.new(arr[0]).feature_array
                        @custom_training_set.train(features, arr[1])
                    end
                end
            end
         | 
| @@ -0,0 +1,21 @@ | |
| 1 | 
            +
            module KnownItemSearchClassifier
                # Holds the precomputed statistics used to build the default model:
                # a prior probability per category and, for each category, the mean
                # and standard deviation of every feature (keyed by feature index).
                class DefaultTrainingSet
                    attr_reader :categories_probabilities, :categories_summaries

                    # Populates both readers with fresh hashes on every instantiation.
                    def initialize
                        @categories_probabilities = { unknown: 0.78, known: 0.22 }
                        @categories_summaries = {
                            unknown: summarize([
                                [0.2564102564102564, 0.4394771815921655],
                                [0.03418803418803419, 0.11344969312798027],
                                [0.002564102564102564, 0.0226455406828919],
                                [0.12991452991452992, 0.26648206508636013],
                                [2.7948717948717947, 2.053561836691609]
                            ]),
                            known: summarize([
                                [0.5454545454545454, 0.5096471914376255],
                                [0.051659451659451655, 0.07957404805575267],
                                [0.021248196248196245, 0.04412470821426937],
                                [0.22550505050505054, 0.2520704609787127],
                                [7.590909090909091, 5.770690236086651]
                            ])
                        }
                    end

                    private

                    # Turns an ordered list of [mean, standard_deviation] pairs into a
                    # hash keyed by the pair's position (0, 1, 2, ...).
                    def summarize(pairs)
                        summary = {}
                        pairs.each_with_index do |(mean, deviation), feature_index|
                            summary[feature_index] = { mean: mean, standard_deviation: deviation }
                        end
                        summary
                    end
                end
            end
         | 
| @@ -0,0 +1,68 @@ | |
| 1 | 
            +
            require 'engtagger'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module KnownItemSearchClassifier
                # Turns a search query string into the numeric feature vector used by
                # the classifier. The string is run through EngTagger and the features
                # are derived from the "/TAG" markers found in the tagger's readable
                # output, plus a simple upper/lower-case check on the raw string.
                class FeatureExtractor
                    # Words that #count_keywords looks for (after stripping punctuation
                    # and downcasing).
                    KEYWORDS = %w[journal course textbook].freeze
                    # Lower-cased titles that #check_against_known_titles matches exactly.
                    KNOWN_TITLES = ['salt sugar fat'].freeze

                    # @param string [String] the search query to analyse
                    def initialize(string)
                        @string = string
                        tagger = EngTagger.new
                        # to_s guards against a nil result; engtagger reportedly returns
                        # nil for blank input -- TODO confirm against the engtagger docs.
                        @tagged = tagger.get_readable(string).to_s
                        # Every "/XX..." marker counts as one tagged token (as a Float so
                        # the ratios below use floating-point division).
                        @num_words = @tagged.scan(/\/[A-Z]{2}/).size.to_f

                        @mixed_case = is_mixed_case?
                        @punctuation_ratio = punctuation_ratio
                        @determiner_ratio = determiner_ratio
                        @proper_noun_ratio = proper_noun_ratio

                        # Not part of the feature vector yet:
                        #@num_keywords = count_keywords
                        #@refers_to_an_item_that_is_known = check_against_known_titles
                    end

                    # @return [Array<Float>] mixed-case flag, punctuation ratio,
                    #   determiner ratio, proper-noun ratio and tagged-token count,
                    #   in that order
                    def feature_array
                        [@mixed_case, @punctuation_ratio, @determiner_ratio, @proper_noun_ratio, @num_words]
                    end

                    private

                    # 1.0 when the raw string contains both an ASCII upper-case and an
                    # ASCII lower-case letter, 0.0 otherwise.
                    def is_mixed_case?
                        @string =~ /[A-Z]/ && @string =~ /[a-z]/ ? 1.0 : 0.0
                    end

                    # Share of tagged tokens whose tag starts with PP.
                    def punctuation_ratio
                        tag_ratio(/\/PP/)
                    end

                    # Share of tagged tokens whose tag starts with DET.
                    def determiner_ratio
                        tag_ratio(/\/DET/)
                    end

                    # Share of tagged tokens whose tag starts with NNP.
                    def proper_noun_ratio
                        tag_ratio(/\/NNP/)
                    end

                    # Occurrences of +tag_pattern+ in the tagged text divided by the
                    # number of tagged tokens. Returns 0.0 when nothing was tagged so
                    # that the feature vector never contains NaN.
                    def tag_ratio(tag_pattern)
                        return 0.0 if @num_words.zero?
                        @tagged.scan(tag_pattern).size.to_f / @num_words
                    end

                    # Number of whitespace-separated words in the query that equal one
                    # of KEYWORDS once punctuation is removed and case is ignored.
                    def count_keywords
                        @string.split.count do |word|
                            KEYWORDS.include?(word.gsub(/[[:punct:]]/, '').downcase)
                        end
                    end

                    # True when the downcased query is exactly one of KNOWN_TITLES.
                    def check_against_known_titles
                        KNOWN_TITLES.include?(@string.downcase)
                    end
                end
            end
         | 
| 68 | 
            +
             | 
| @@ -0,0 +1,62 @@ | |
| 1 | 
            +
            require 'coveralls'
         | 
| 2 | 
            +
            Coveralls.wear!
         | 
| 3 | 
            +
            require 'minitest/autorun'
         | 
| 4 | 
            +
            require './lib/known_item_search_classifier'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
             | 
| 7 | 
            +
            # Generates one test method per sample query: queries in the first list
            # are expected to be labelled :known, queries in the second :unknown.
            # A single classifier instance (default model) is shared by all tests.
            class KnownItemSearchClassifierTest < Minitest::Test
                classifier = KnownItemSearchClassifier::Classifier.new

                # Builds the generated method name: punctuation is dropped and each
                # whitespace character becomes an underscore.
                test_name = lambda do |query, suffix|
                    slug = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
                    "test_#{slug}_#{suffix}"
                end

                [
                    "little house on the",
                    "the inconvenient truth",
                    "the question of animal Culture by Kevin N Laland; Bennett G Galef ",
                    "Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction.  Viking. 2015. Print",
                    "The Boy in Zaquitos",
                    "The Mis-Education of the Negro",
                    "human relations interpersonal job-oriented skills",
                    "Research Methods for Business: A Skill-Building Approach Effectiveness of Instruction Performed through Computer-Assisted Activity Schedules on On-Schedule and Role-Play Skills of Children with Autism Spectrum Disorder",
                    "competency skills for the dental assiostant",
                    "Why did they kill?: Cambodia in the shadow of genocide",
                    "salt sugar fat",
                    "Making a Killing: Femicide, Free Trade, and La Frontera",
                ].each do |known_query|
                    define_method(test_name.call(known_query, 'is_classified_as_known_item')) do
                        assert_equal(:known, classifier.is_known_item_search?(known_query))
                    end
                end

                [
                    "earthworms",
                    "network security",
                    "work stress",
                    "mummies",
                    "benefits of eating healthyhy",
                    "benefits of eating healthy",
                    "megadosing vitamin c",
                    "nutrition",
                    "penquin",
                    "bananas",
                    "food sourcing",
                    "whey protein",
                    "exotic animals",
                    "sweet home oregon",
                    "taylor swift",
                    "catholicism",
                    "Professional baking ",
                    "concussions after the nfl",
                    "IVF the US",
                    "adoption children the US",
                    "Films for the hearing impaired",
                    "wolves and the ecosystem",
                    "dr. martin luther king",
                ].each do |unknown_query|
                    define_method(test_name.call(unknown_query, 'is_not_false_positive')) do
                        assert_equal(:unknown, classifier.is_known_item_search?(unknown_query))
                    end
                end
            end
         | 
    
        metadata
    ADDED
    
    | @@ -0,0 +1,106 @@ | |
| 1 | 
            +
            --- !ruby/object:Gem::Specification
         | 
| 2 | 
            +
            name: known_item_search_classifier
         | 
| 3 | 
            +
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            +
              version: 0.1.0
         | 
| 5 | 
            +
            platform: ruby
         | 
| 6 | 
            +
            authors:
         | 
| 7 | 
            +
            - Jane Sandberg
         | 
| 8 | 
            +
            autorequire: 
         | 
| 9 | 
            +
            bindir: bin
         | 
| 10 | 
            +
            cert_chain: []
         | 
| 11 | 
            +
            date: 2016-11-13 00:00:00.000000000 Z
         | 
| 12 | 
            +
            dependencies:
         | 
| 13 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 14 | 
            +
              name: engtagger
         | 
| 15 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 16 | 
            +
                requirements:
         | 
| 17 | 
            +
                - - ">="
         | 
| 18 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 19 | 
            +
                    version: 0.2.1
         | 
| 20 | 
            +
              type: :runtime
         | 
| 21 | 
            +
              prerelease: false
         | 
| 22 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 23 | 
            +
                requirements:
         | 
| 24 | 
            +
                - - ">="
         | 
| 25 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 26 | 
            +
                    version: 0.2.1
         | 
| 27 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 28 | 
            +
              name: gaussian_naive_bayes
         | 
| 29 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 30 | 
            +
                requirements:
         | 
| 31 | 
            +
                - - ">="
         | 
| 32 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 33 | 
            +
                    version: 0.1.1
         | 
| 34 | 
            +
              type: :runtime
         | 
| 35 | 
            +
              prerelease: false
         | 
| 36 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 37 | 
            +
                requirements:
         | 
| 38 | 
            +
                - - ">="
         | 
| 39 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 40 | 
            +
                    version: 0.1.1
         | 
| 41 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 42 | 
            +
              name: minitest
         | 
| 43 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 44 | 
            +
                requirements:
         | 
| 45 | 
            +
                - - ">="
         | 
| 46 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 47 | 
            +
                    version: '0'
         | 
| 48 | 
            +
              type: :development
         | 
| 49 | 
            +
              prerelease: false
         | 
| 50 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 51 | 
            +
                requirements:
         | 
| 52 | 
            +
                - - ">="
         | 
| 53 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 54 | 
            +
                    version: '0'
         | 
| 55 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 56 | 
            +
              name: coveralls
         | 
| 57 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 58 | 
            +
                requirements:
         | 
| 59 | 
            +
                - - '='
         | 
| 60 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 61 | 
            +
                    version: 0.7.0
         | 
| 62 | 
            +
              type: :development
         | 
| 63 | 
            +
              prerelease: false
         | 
| 64 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 65 | 
            +
                requirements:
         | 
| 66 | 
            +
                - - '='
         | 
| 67 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 68 | 
            +
                    version: 0.7.0
         | 
| 69 | 
            +
            description: Classify search query strings
         | 
| 70 | 
            +
            email: sandbej@linnbenton.edu
         | 
| 71 | 
            +
            executables: []
         | 
| 72 | 
            +
            extensions: []
         | 
| 73 | 
            +
            extra_rdoc_files: []
         | 
| 74 | 
            +
            files:
         | 
| 75 | 
            +
            - lib/known_item_search_classifier.rb
         | 
| 76 | 
            +
            - lib/known_item_search_classifier/classifier.rb
         | 
| 77 | 
            +
            - lib/known_item_search_classifier/default_training_set.rb
         | 
| 78 | 
            +
            - lib/known_item_search_classifier/feature_extractor.rb
         | 
| 79 | 
            +
            - test/known_item_search_classifier_test.rb
         | 
| 80 | 
            +
            homepage: https://github.com/sandbergja/known_item_search_classifier
         | 
| 81 | 
            +
            licenses:
         | 
| 82 | 
            +
            - MIT
         | 
| 83 | 
            +
            metadata: {}
         | 
| 84 | 
            +
            post_install_message: 
         | 
| 85 | 
            +
            rdoc_options: []
         | 
| 86 | 
            +
            require_paths:
         | 
| 87 | 
            +
            - lib
         | 
| 88 | 
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 89 | 
            +
              requirements:
         | 
| 90 | 
            +
              - - ">="
         | 
| 91 | 
            +
                - !ruby/object:Gem::Version
         | 
| 92 | 
            +
                  version: '0'
         | 
| 93 | 
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 94 | 
            +
              requirements:
         | 
| 95 | 
            +
              - - ">="
         | 
| 96 | 
            +
                - !ruby/object:Gem::Version
         | 
| 97 | 
            +
                  version: '0'
         | 
| 98 | 
            +
            requirements: []
         | 
| 99 | 
            +
            rubyforge_project: 
         | 
| 100 | 
            +
            rubygems_version: 2.5.1
         | 
| 101 | 
            +
            signing_key: 
         | 
| 102 | 
            +
            specification_version: 4
         | 
| 103 | 
            +
            summary: A ruby gem that classifies search query strings as either known-item searches
         | 
| 104 | 
            +
              or unknown-item searches
         | 
| 105 | 
            +
            test_files:
         | 
| 106 | 
            +
            - test/known_item_search_classifier_test.rb
         |