known_item_search_classifier 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/known_item_search_classifier/classifier.rb +5 -3
- data/lib/known_item_search_classifier/default_training_set.rb +20 -18
- data/lib/known_item_search_classifier/feature_extractor.rb +3 -1
- data/lib/known_item_search_classifier.rb +2 -0
- data/test/known_item_search_classifier_test.rb +131 -29
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 026e2cc9387f7e1bbfd52bcf37d24a32e5f44665f4217e8e87276fff08cd9951
|
4
|
+
data.tar.gz: 2c4d42876c29118a017397d1cb9a55a6e79323cbca71929b6f1ff72b414fd353
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '0295cfeddf0ed53d31a39053b5cbded6ef311eaf319a056564889953a592aa89e379a10274b7972f9109b8066fd594932b4511d2cbf4da72b06df65643cfba2b'
|
7
|
+
data.tar.gz: ec9c1cfb9ad88193d7c8b0d52d31b0b9744c4d2862f4fc868240678219632cb6a364c4ff14180de66d5aa95e5a4f293de85bea45fc7d132069b41d9ec9c1e818
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Classifies search strings as either known-item searches or unknown-item searches
|
2
4
|
require 'csv'
|
3
5
|
require 'gaussian_naive_bayes'
|
@@ -29,9 +31,9 @@ module KnownItemSearchClassifier
|
|
29
31
|
end
|
30
32
|
end
|
31
33
|
|
32
|
-
|
34
|
+
attr_reader :custom_training_set
|
33
35
|
|
34
|
-
|
36
|
+
private
|
35
37
|
|
36
38
|
def classify(string)
|
37
39
|
f = FeatureExtractor.new string
|
@@ -43,7 +45,7 @@ module KnownItemSearchClassifier
|
|
43
45
|
query_class = @default_training_set.classify(feature_array)
|
44
46
|
end
|
45
47
|
return query_class
|
46
|
-
return true if
|
48
|
+
return true if query_class == :known
|
47
49
|
|
48
50
|
false
|
49
51
|
end
|
@@ -1,24 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module KnownItemSearchClassifier
|
2
4
|
class DefaultTrainingSet
|
3
|
-
|
5
|
+
def categories_probabilities
|
6
|
+
{ 'unknown' => 0.552, 'known' => 0.448 }
|
7
|
+
end
|
4
8
|
|
5
|
-
def
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
4 => { mean: 2.44, standard_deviation: 1.0720950308167836 },
|
21
|
-
5 => { mean: 0.14, standard_deviation: 0.7001457574195914 } } }
|
9
|
+
def categories_summaries
|
10
|
+
{ 'unknown' =>
|
11
|
+
{ 0 => { mean: 0.34782608695652173, standard_deviation: 0.4774351058385226 },
|
12
|
+
1 => { mean: 0.024483651893859832, standard_deviation: 0.09877772267472562 },
|
13
|
+
2 => { mean: 0.004589371980676328, standard_deviation: 0.032912070012630565 },
|
14
|
+
3 => { mean: 0.23732759224252611, standard_deviation: 0.3775861432826544 },
|
15
|
+
4 => { mean: 2.5072463768115942, standard_deviation: 1.7511981064219535 },
|
16
|
+
5 => { mean: 0.05314009661835749, standard_deviation: 0.44293515127930905 } },
|
17
|
+
'known' =>
|
18
|
+
{ 0 => { mean: 0.6011904761904762, standard_deviation: 0.49111727628992086 },
|
19
|
+
1 => { mean: 0.05071208898451003, standard_deviation: 0.10551578513388955 },
|
20
|
+
2 => { mean: 0.043913293286663324, standard_deviation: 0.089967379556707 },
|
21
|
+
3 => { mean: 0.24845155590636964, standard_deviation: 0.3038306400149365 },
|
22
|
+
4 => { mean: 5.071428571428571, standard_deviation: 4.292552286529769 },
|
23
|
+
5 => { mean: 2.8214285714285716, standard_deviation: 4.933475325294178 } } }
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'engtagger'
|
2
4
|
|
3
5
|
module KnownItemSearchClassifier
|
@@ -25,7 +27,7 @@ module KnownItemSearchClassifier
|
|
25
27
|
private
|
26
28
|
|
27
29
|
def is_mixed_case?
|
28
|
-
return 1.0 if @string =~ /[A-Z]/
|
30
|
+
return 1.0 if @string =~ (/[A-Z]/) && @string =~ (/[a-z]/)
|
29
31
|
|
30
32
|
0.0
|
31
33
|
end
|
@@ -1,46 +1,148 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'minitest/autorun'
|
2
4
|
require './lib/known_item_search_classifier'
|
3
5
|
|
4
6
|
class KnownItemSearchClassifierTest < Minitest::Test
|
5
7
|
classifier = KnownItemSearchClassifier::Classifier.new
|
6
8
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
'
|
14
|
-
'
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
'
|
19
|
-
'
|
20
|
-
'
|
21
|
-
'
|
9
|
+
known_items = [
|
10
|
+
'0036-8075',
|
11
|
+
'9781324033356',
|
12
|
+
'British Documents on the End of Empire',
|
13
|
+
'9780393979503',
|
14
|
+
'9781319339425',
|
15
|
+
'D810.W7 A5313 2017b',
|
16
|
+
'9783031512117',
|
17
|
+
'9780470044377',
|
18
|
+
'9780393609615',
|
19
|
+
'AC8 .B4353 1979',
|
20
|
+
# 'Kafka metamorphosis',
|
21
|
+
# 'how children develop',
|
22
|
+
# 'milton\'s paradise lost',
|
23
|
+
'9780471140260',
|
24
|
+
# 'Fortaleciendo la participación política de las mujeres',
|
25
|
+
# 'historia augusta',
|
26
|
+
# 'karl marx letters',
|
27
|
+
# 'stuck in place',
|
28
|
+
'Court and Country: Studies in Tudor Social History',
|
29
|
+
'E184.6 .L48 2009',
|
30
|
+
'JK274 .K62 1995',
|
31
|
+
'JZ1480 .H68 2013',
|
32
|
+
'TD186.5.B35 A4413 2016',
|
33
|
+
'Women and gender issues in Bolivia, II, 1991-2003.',
|
34
|
+
# 'arrow of god',
|
35
|
+
# 'going stealth',
|
36
|
+
# 'greek dictionary brill',
|
37
|
+
# 'historia de los indios motolinia',
|
38
|
+
'9780143115267',
|
39
|
+
'9780321573513',
|
40
|
+
'9788498956160',
|
41
|
+
'AC25 .B3132 1972b',
|
42
|
+
'Agrarian issues in Bolivia, I, 1989-2004.',
|
43
|
+
'Indigenous peoples, peasants, and ethnic minorities in Bolivia, I, 1970-2005.',
|
44
|
+
'MT6 .C57 2016',
|
45
|
+
# 'Plato\'s symposium',
|
46
|
+
'le droit a la ville',
|
47
|
+
# 'providence journal',
|
48
|
+
'the atlantic',
|
49
|
+
'9780137605521',
|
50
|
+
'9788876768170',
|
51
|
+
# 'Book of Mormon',
|
52
|
+
# 'Euclid\'s elements'
|
22
53
|
]
|
23
|
-
|
54
|
+
known_items.each do |query|
|
24
55
|
cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
|
25
56
|
define_method("test_#{cleaned_up_query}_is_classified_as_known_item") do
|
26
57
|
assert_equal(:known, classifier.is_known_item_search?(query).to_sym)
|
27
58
|
end
|
28
59
|
end
|
29
60
|
|
30
|
-
|
31
|
-
'
|
32
|
-
'
|
33
|
-
'
|
34
|
-
'
|
35
|
-
'
|
36
|
-
'
|
37
|
-
|
38
|
-
'
|
39
|
-
'
|
40
|
-
'
|
41
|
-
'
|
61
|
+
unknown_items = [
|
62
|
+
'"paul parin"',
|
63
|
+
'architecture new orleans',
|
64
|
+
'Al-alusi',
|
65
|
+
'animals NOT coin / ',
|
66
|
+
'hanne darboven',
|
67
|
+
'William Burroughs',
|
68
|
+
'Platonic eros',
|
69
|
+
'cavafy',
|
70
|
+
'Marcel proust',
|
71
|
+
'Zisterzienser',
|
72
|
+
'costume',
|
73
|
+
'cultural delegation',
|
74
|
+
'Akhir al-Zaman',
|
75
|
+
'Heidegger',
|
76
|
+
'Nietzsche',
|
77
|
+
'Wind in poetry',
|
78
|
+
'argentina',
|
79
|
+
'facades architecture',
|
80
|
+
'medieval women',
|
81
|
+
'monrovia urbanization',
|
82
|
+
'philadelphia',
|
83
|
+
'roman theater',
|
84
|
+
'Holderline',
|
85
|
+
'Wittgenstein',
|
86
|
+
'hindi history politics',
|
87
|
+
'post-industrial society',
|
88
|
+
'Luxemburg',
|
89
|
+
'Tasso',
|
90
|
+
'Women in Scandinavia',
|
91
|
+
'corporate governance',
|
92
|
+
'dance technique',
|
93
|
+
'water atlas',
|
94
|
+
'Diskin Clay',
|
95
|
+
'charlotte posenenske',
|
96
|
+
'dinosaur',
|
97
|
+
'paul rudolph',
|
98
|
+
'urdu hindi history',
|
99
|
+
'Ashrat al',
|
100
|
+
'Ephemera',
|
101
|
+
'John Hopfield',
|
102
|
+
'Russian formalism',
|
103
|
+
'artificial intelligence',
|
104
|
+
'bats',
|
105
|
+
# 'coffee and the american revolution',
|
106
|
+
'cold war middle east',
|
107
|
+
'environmental history ethiopia',
|
108
|
+
'indigenous',
|
109
|
+
'politbiuro',
|
110
|
+
'stoicism',
|
111
|
+
'suetonius',
|
112
|
+
'tokyo edo',
|
113
|
+
'washington dc',
|
114
|
+
'Achaean war',
|
115
|
+
'Isabella piccini',
|
116
|
+
'Renato Sollima',
|
117
|
+
'animals',
|
118
|
+
'auguste leroux',
|
119
|
+
'blockchain',
|
120
|
+
'britain navy diary',
|
121
|
+
'fairy',
|
122
|
+
'jose guadalupe posada',
|
123
|
+
'lesbian',
|
124
|
+
'physics textbook',
|
125
|
+
'walter benjamin',
|
126
|
+
'"Middlebury College Museum of Art"',
|
127
|
+
'"daniel catan"',
|
128
|
+
'Hellenistic kingdoms',
|
129
|
+
'Albrecht Classen',
|
130
|
+
'Athir al-din al-abhari',
|
131
|
+
'Berlin',
|
132
|
+
'David konstan',
|
133
|
+
'Elizabeth Cary Mariam',
|
134
|
+
'Euclid',
|
135
|
+
'George sand on literature',
|
136
|
+
'Kyrgyz art',
|
137
|
+
'Slavic folklore',
|
138
|
+
'mass incarceration california',
|
139
|
+
'military history korean war',
|
140
|
+
'snakes',
|
141
|
+
'sondheim',
|
142
|
+
'"bronx"',
|
143
|
+
'Between Hermon and Sinai'
|
42
144
|
]
|
43
|
-
|
145
|
+
unknown_items.each do |query|
|
44
146
|
cleaned_up_query = query.gsub(/[[:punct:]]/, '').gsub(/[[:space:]]/, '_')
|
45
147
|
define_method("test_#{cleaned_up_query}_is_not_false_positive") do
|
46
148
|
assert_equal(:unknown, classifier.is_known_item_search?(query).to_sym)
|