known_item_search_classifier 0.3.0 → 0.3.1

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a95567708e0b56c79c3a102e1d7c72e493e5660518de3b24c8fc42a691609938
4
- data.tar.gz: 70ea59d9d7c0451b3d454506e578c2761c12e7d226edca852431c76bee1a9456
3
+ metadata.gz: 026e2cc9387f7e1bbfd52bcf37d24a32e5f44665f4217e8e87276fff08cd9951
4
+ data.tar.gz: 2c4d42876c29118a017397d1cb9a55a6e79323cbca71929b6f1ff72b414fd353
5
5
  SHA512:
6
- metadata.gz: 4fb37b0932e9e0c32f9ec0ef6bdc563bd7e4e4cca5f401186daec4ae8d3be112b96478a9f04cf715620144e5db30e340959db808d5cc99841360dd72d480984d
7
- data.tar.gz: 96777f8fa22a9208dc4e22a76a4c74dd57785c32a52de386d0e78678880a0d0faa020e0390fe8d1275c2cc3326cdf6cfa2fae6dcee4bbe299c66c79918a696fd
6
+ metadata.gz: '0295cfeddf0ed53d31a39053b5cbded6ef311eaf319a056564889953a592aa89e379a10274b7972f9109b8066fd594932b4511d2cbf4da72b06df65643cfba2b'
7
+ data.tar.gz: ec9c1cfb9ad88193d7c8b0d52d31b0b9744c4d2862f4fc868240678219632cb6a364c4ff14180de66d5aa95e5a4f293de85bea45fc7d132069b41d9ec9c1e818
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Classifies search strings as either known-item searches or unknown-item searches
2
4
  require 'csv'
3
5
  require 'gaussian_naive_bayes'
@@ -29,9 +31,9 @@ module KnownItemSearchClassifier
29
31
  end
30
32
  end
31
33
 
32
- private
34
+ attr_reader :custom_training_set
33
35
 
34
- attr_reader :custom_tr
36
+ private
35
37
 
36
38
  def classify(string)
37
39
  f = FeatureExtractor.new string
@@ -43,7 +45,7 @@ module KnownItemSearchClassifier
43
45
  query_class = @default_training_set.classify(feature_array)
44
46
  end
45
47
  return query_class
46
- return true if :known == query_class
48
+ return true if query_class == :known
47
49
 
48
50
  false
49
51
  end
@@ -1,24 +1,26 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module KnownItemSearchClassifier
2
4
  class DefaultTrainingSet
3
- attr_reader :categories_probabilities, :categories_summaries
5
+ def categories_probabilities
6
+ { 'unknown' => 0.552, 'known' => 0.448 }
7
+ end
4
8
 
5
- def initialize
6
- @categories_probabilities = { 'known' => 0.3333333333333333, 'unknown' => 0.6666666666666666 }
7
- @categories_summaries =
8
- { 'known' =>
9
- { 0 => { mean: 0.6, standard_deviation: 0.5 },
10
- 1 => { mean: 0.0516060606060606, standard_deviation: 0.09910312916958242 },
11
- 2 => { mean: 0.06633333333333333, standard_deviation: 0.13412266359153804 },
12
- 3 => { mean: 0.2575454545454545, standard_deviation: 0.27976953051588926 },
13
- 4 => { mean: 4.76, standard_deviation: 3.8867295592395754 },
14
- 5 => { mean: 3.48, standard_deviation: 4.91697739131132 } },
15
- 'unknown' =>
16
- { 0 => { mean: 0.18, standard_deviation: 0.38808793449160356 },
17
- 1 => { mean: 0.03966666666666667, standard_deviation: 0.1241245990920947 },
18
- 2 => { mean: 0.009000000000000001, standard_deviation: 0.04482391854210637 },
19
- 3 => { mean: 0.11, standard_deviation: 0.25134558515041244 },
20
- 4 => { mean: 2.44, standard_deviation: 1.0720950308167836 },
21
- 5 => { mean: 0.14, standard_deviation: 0.7001457574195914 } } }
9
+ def categories_summaries
10
+ { 'unknown' =>
11
+ { 0 => { mean: 0.34782608695652173, standard_deviation: 0.4774351058385226 },
12
+ 1 => { mean: 0.024483651893859832, standard_deviation: 0.09877772267472562 },
13
+ 2 => { mean: 0.004589371980676328, standard_deviation: 0.032912070012630565 },
14
+ 3 => { mean: 0.23732759224252611, standard_deviation: 0.3775861432826544 },
15
+ 4 => { mean: 2.5072463768115942, standard_deviation: 1.7511981064219535 },
16
+ 5 => { mean: 0.05314009661835749, standard_deviation: 0.44293515127930905 } },
17
+ 'known' =>
18
+ { 0 => { mean: 0.6011904761904762, standard_deviation: 0.49111727628992086 },
19
+ 1 => { mean: 0.05071208898451003, standard_deviation: 0.10551578513388955 },
20
+ 2 => { mean: 0.043913293286663324, standard_deviation: 0.089967379556707 },
21
+ 3 => { mean: 0.24845155590636964, standard_deviation: 0.3038306400149365 },
22
+ 4 => { mean: 5.071428571428571, standard_deviation: 4.292552286529769 },
23
+ 5 => { mean: 2.8214285714285716, standard_deviation: 4.933475325294178 } } }
22
24
  end
23
25
  end
24
26
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'engtagger'
2
4
 
3
5
  module KnownItemSearchClassifier
@@ -25,7 +27,7 @@ module KnownItemSearchClassifier
25
27
  private
26
28
 
27
29
  def is_mixed_case?
28
- return 1.0 if @string =~ /[A-Z]/ and @string =~ /[a-z]/
30
+ return 1.0 if @string =~ (/[A-Z]/) && @string =~ (/[a-z]/)
29
31
 
30
32
  0.0
31
33
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'known_item_search_classifier/default_training_set'
2
4
  require 'known_item_search_classifier/feature_extractor'
3
5
  require 'known_item_search_classifier/classifier'
@@ -1,46 +1,148 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'minitest/autorun'
2
4
  require './lib/known_item_search_classifier'
3
5
 
4
6
  class KnownItemSearchClassifierTest < Minitest::Test
5
7
  classifier = KnownItemSearchClassifier::Classifier.new
6
8
 
7
- known_item_training_set = [
8
- # 'hobbit first edition', -- classifier incorrectly classifies this as unknown
9
- # 'my soul is rested', -- classifier incorrectly classifies this as unknown
10
- # 'new yorker', -- classifier incorrectly classifies this as unknown
11
- # 'when harry met sally', -- classifier incorrectly classifies this as unknown
12
- # '"neo tekunoroji"', -- classifier incorrectly classifies this as unknown
13
- '99131236427206421',
14
- 'A decision making model for selecting start-up businesses in a government venture capital scheme',
15
- # 'Dostoevsky Brothers Karamazov', -- classifier incorrectly classifies this as unknown
16
- # 'Lawrence Classic American Literature', -- classifier incorrectly classifies this as unknown
17
- # 'salt sugar fat', -- classifier incorrectly classifies this as unknown
18
- 'Robinson Ken. Creative Schools: The Grassroots Revolution That’s Transforming Eduction. Viking. 2015. Print',
19
- 'the inconvenient truth',
20
- 'Polarization: What Everyone Needs to Know',
21
- 'little house on the'
9
+ known_items = [
10
+ '0036-8075',
11
+ '9781324033356',
12
+ 'British Documents on the End of Empire',
13
+ '9780393979503',
14
+ '9781319339425',
15
+ 'D810.W7 A5313 2017b',
16
+ '9783031512117',
17
+ '9780470044377',
18
+ '9780393609615',
19
+ 'AC8 .B4353 1979',
20
+ # 'Kafka metamorphosis',
21
+ # 'how children develop',
22
+ # 'milton\'s paradise lost',
23
+ '9780471140260',
24
+ # 'Fortaleciendo la participación política de las mujeres',
25
+ # 'historia augusta',
26
+ # 'karl marx letters',
27
+ # 'stuck in place',
28
+ 'Court and Country: Studies in Tudor Social History',
29
+ 'E184.6 .L48 2009',
30
+ 'JK274 .K62 1995',
31
+ 'JZ1480 .H68 2013',
32
+ 'TD186.5.B35 A4413 2016',
33
+ 'Women and gender issues in Bolivia, II, 1991-2003.',
34
+ # 'arrow of god',
35
+ # 'going stealth',
36
+ # 'greek dictionary brill',
37
+ # 'historia de los indios motolinia',
38
+ '9780143115267',
39
+ '9780321573513',
40
+ '9788498956160',
41
+ 'AC25 .B3132 1972b',
42
+ 'Agrarian issues in Bolivia, I, 1989-2004.',
43
+ 'Indigenous peoples, peasants, and ethnic minorities in Bolivia, I, 1970-2005.',
44
+ 'MT6 .C57 2016',
45
+ # 'Plato\'s symposium',
46
+ 'le droit a la ville',
47
+ # 'providence journal',
48
+ 'the atlantic',
49
+ '9780137605521',
50
+ '9788876768170',
51
+ # 'Book of Mormon',
52
+ # 'Euclid\'s elements'
22
53
  ]
23
- known_item_training_set.each do |query|
54
+ known_items.each do |query|
24
55
  cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
25
56
  define_method("test_#{cleaned_up_query}_is_classified_as_known_item") do
26
57
  assert_equal(:known, classifier.is_known_item_search?(query).to_sym)
27
58
  end
28
59
  end
29
60
 
30
- unknown_item_training_set = [
31
- 'colonial mexico textiles',
32
- 'history of horses',
33
- 'medical expertise COVID',
34
- 'music and sexuality',
35
- 'paper industry',
36
- 'sun ra',
37
- # 'concussions after the nfl', -- classifier incorrectly classifies this as known
38
- 'Professional baking ',
39
- 'Manos chatzidakis',
40
- 'whey protein',
41
- 'benefits of eating healthyhy'
61
+ unknown_items = [
62
+ '"paul parin"',
63
+ 'architecture new orleans',
64
+ 'Al-alusi',
65
+ 'animals NOT coin / ',
66
+ 'hanne darboven',
67
+ 'William Burroughs',
68
+ 'Platonic eros',
69
+ 'cavafy',
70
+ 'Marcel proust',
71
+ 'Zisterzienser',
72
+ 'costume',
73
+ 'cultural delegation',
74
+ 'Akhir al-Zaman',
75
+ 'Heidegger',
76
+ 'Nietzsche',
77
+ 'Wind in poetry',
78
+ 'argentina',
79
+ 'facades architecture',
80
+ 'medieval women',
81
+ 'monrovia urbanization',
82
+ 'philadelphia',
83
+ 'roman theater',
84
+ 'Holderline',
85
+ 'Wittgenstein',
86
+ 'hindi history politics',
87
+ 'post-industrial society',
88
+ 'Luxemburg',
89
+ 'Tasso',
90
+ 'Women in Scandinavia',
91
+ 'corporate governance',
92
+ 'dance technique',
93
+ 'water atlas',
94
+ 'Diskin Clay',
95
+ 'charlotte posenenske',
96
+ 'dinosaur',
97
+ 'paul rudolph',
98
+ 'urdu hindi history',
99
+ 'Ashrat al',
100
+ 'Ephemera',
101
+ 'John Hopfield',
102
+ 'Russian formalism',
103
+ 'artificial intelligence',
104
+ 'bats',
105
+ # 'coffee and the american revolution',
106
+ 'cold war middle east',
107
+ 'environmental history ethiopia',
108
+ 'indigenous',
109
+ 'politbiuro',
110
+ 'stoicism',
111
+ 'suetonius',
112
+ 'tokyo edo',
113
+ 'washington dc',
114
+ 'Achaean war',
115
+ 'Isabella piccini',
116
+ 'Renato Sollima',
117
+ 'animals',
118
+ 'auguste leroux',
119
+ 'blockchain',
120
+ 'britain navy diary',
121
+ 'fairy',
122
+ 'jose guadalupe posada',
123
+ 'lesbian',
124
+ 'physics textbook',
125
+ 'walter benjamin',
126
+ '"Middlebury College Museum of Art"',
127
+ '"daniel catan"',
128
+ 'Hellenistic kingdoms',
129
+ 'Albrecht Classen',
130
+ 'Athir al-din al-abhari',
131
+ 'Berlin',
132
+ 'David konstan',
133
+ 'Elizabeth Cary Mariam',
134
+ 'Euclid',
135
+ 'George sand on literature',
136
+ 'Kyrgyz art',
137
+ 'Slavic folklore',
138
+ 'mass incarceration california',
139
+ 'military history korean war',
140
+ 'snakes',
141
+ 'sondheim',
142
+ '"bronx"',
143
+ 'Between Hermon and Sinai'
42
144
  ]
43
- unknown_item_training_set.each do |query|
145
+ unknown_items.each do |query|
44
146
  cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
45
147
  define_method("test_#{cleaned_up_query}_is_not_false_positive") do
46
148
  assert_equal(:unknown, classifier.is_known_item_search?(query).to_sym)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: known_item_search_classifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jane Sandberg