doc_rank 0.0.0.alpha.2 → 0.0.0.alpha.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/doc_rank.gemspec +4 -3
- data/lib/doc_rank.rb +1 -12
- data/lib/doc_rank/document.rb +6 -4
- data/lib/doc_rank/ranker.rb +54 -0
- data/lib/doc_rank/version.rb +1 -1
- metadata +15 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 78aa3f24fc74f25238b5e0888027a4d3debd9b44
|
4
|
+
data.tar.gz: 989694cd61a85196089db8d8a6c3a2977ad14cb7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e1536fe620ca05693665e1524863cfcaaa5cd99d61310641ea196b5580ecd6816d707c0d55e0a22b6a47e44cf708066dad0549684135bcb93a64809bdb57aa59
|
7
|
+
data.tar.gz: c0a0a7b0d9307d51c26aef1c1fe74de3c9de7d4231f6249df076cf9d069a3b1b4036377d30b884fbba372953a597d8de98943819c4844ed5c1e5f78e50620d95
|
data/CHANGELOG.md
CHANGED
@@ -6,6 +6,15 @@ For more information about changelogs, check
|
|
6
6
|
[Keep a Changelog](http://keepachangelog.com) and
|
7
7
|
[Vandamme](http://tech-angels.github.io/vandamme).
|
8
8
|
|
9
|
+
## 0.0.0.alpha.3 - 2017/09/15
|
10
|
+
|
11
|
+
* [FEATURE] Add `DocRank::Ranker#base_doc`
|
12
|
+
* [FEATURE] Add `DocRank::Ranker#target_docs`
|
13
|
+
* [FEATURE] Add `DocRank::Ranker#scores`
|
14
|
+
* [FEATURE] Add `DocRank::Ranker#weighted_scores`
|
15
|
+
* [FEATURE] Change `DocRank::Document#keywords` to `DocRank::Document#weighted_keywords`
|
16
|
+
* [FEATURE] Remove `DocRank.compare`
|
17
|
+
|
9
18
|
## 0.0.0.alpha.2 - 2017/09/07
|
10
19
|
|
11
20
|
* [FEATURE] Add `DocRank.compare`
|
data/doc_rank.gemspec
CHANGED
@@ -13,6 +13,8 @@ Gem::Specification.new do |spec|
|
|
13
13
|
spec.homepage = "https://github.com/S1v4/doc_rank"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
16
|
+
spec.required_ruby_version = '>= 2.4'
|
17
|
+
|
16
18
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
17
19
|
f.match(%r{^(test|spec|features)/})
|
18
20
|
end
|
@@ -20,13 +22,12 @@ Gem::Specification.new do |spec|
|
|
20
22
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
21
23
|
spec.require_paths = ["lib"]
|
22
24
|
|
25
|
+
spec.add_runtime_dependency "highscore", "~> 1.2.1"
|
26
|
+
spec.add_runtime_dependency "yomu", "~> 0.2.4"
|
23
27
|
spec.add_development_dependency "yard", "~> 0.9.9"
|
24
|
-
spec.add_development_dependency "highscore", "~> 1.2.1"
|
25
|
-
spec.add_development_dependency "yomu", "~> 0.2.4"
|
26
28
|
spec.add_development_dependency "bundler", "~> 1.15"
|
27
29
|
spec.add_development_dependency "rake", "~> 10.0"
|
28
30
|
spec.add_development_dependency "rspec", "~> 3.0"
|
29
31
|
spec.add_development_dependency "coveralls"
|
30
|
-
spec.add_development_dependency "pry"
|
31
32
|
spec.add_development_dependency "pry-nav"
|
32
33
|
end
|
data/lib/doc_rank.rb
CHANGED
@@ -1,17 +1,6 @@
|
|
1
1
|
require "doc_rank/version"
|
2
|
-
require "doc_rank/
|
2
|
+
require "doc_rank/ranker"
|
3
3
|
|
4
4
|
# A gem to rank document similarities.
|
5
5
|
module DocRank
|
6
|
-
# @return [Hash] a hash of document names mapped to their score.
|
7
|
-
# @param [String] base the file by which target documents are scored againsts.
|
8
|
-
# @param [Array<String>] targets a single file or many files used for comparison.
|
9
|
-
def self.compare(base, targets)
|
10
|
-
base_keywords = DocRank::Document.new(base).keywords.map(&:text)
|
11
|
-
Array(targets).map do |target|
|
12
|
-
doc = DocRank::Document.new target
|
13
|
-
matches = base_keywords & doc.keywords.map(&:text)
|
14
|
-
[doc.name, matches.size]
|
15
|
-
end.sort_by{|doc| doc[1]}.to_h
|
16
|
-
end
|
17
6
|
end
|
data/lib/doc_rank/document.rb
CHANGED
@@ -4,21 +4,23 @@ require 'highscore'
|
|
4
4
|
# A gem to rank document similarities.
|
5
5
|
module DocRank
|
6
6
|
# Document holds the contents of a document (.pdf, .docx, .txt) and has methods:
|
7
|
-
# name, text, and
|
7
|
+
# name, text, and weighted_keywords.
|
8
8
|
class Document
|
9
9
|
|
10
10
|
# [String] the name of the document.
|
11
11
|
attr_reader :name
|
12
12
|
# [String] the text content of the document.
|
13
13
|
attr_reader :text
|
14
|
-
# [
|
15
|
-
attr_reader :
|
14
|
+
# [Hash] a collection of keywords mapped to their weight.
|
15
|
+
attr_reader :weighted_keywords
|
16
16
|
|
17
17
|
# @param [String] file_path The path to the file.
|
18
18
|
def initialize(file_path)
|
19
19
|
@text = Yomu.new(file_path).text
|
20
20
|
@name = File.basename file_path
|
21
|
-
@
|
21
|
+
@weighted_keywords = Highscore::Content.new(@text).keywords.map do |kw|
|
22
|
+
[kw.text, kw.weight]
|
23
|
+
end.to_h
|
22
24
|
end
|
23
25
|
end
|
24
26
|
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require "doc_rank/document"
|
2
|
+
|
3
|
+
# A gem to rank document similarities.
|
4
|
+
module DocRank
|
5
|
+
# Ranker ranks documents by keywords and stores them in: scores and weighted_scores.
|
6
|
+
class Ranker
|
7
|
+
|
8
|
+
# [DocRank::Document] the base document.
|
9
|
+
attr_reader :base_doc
|
10
|
+
# [Array<DocRank::Document>] the target documents.
|
11
|
+
attr_reader :target_docs
|
12
|
+
|
13
|
+
# @param [String] base the file against which target documents are scored.
|
14
|
+
# @param [Array<String>] targets a single file or many files used for comparison.
|
15
|
+
def initialize(base, targets)
|
16
|
+
@base_doc = DocRank::Document.new base
|
17
|
+
@base_kws = @base_doc.weighted_keywords
|
18
|
+
@target_docs = Array(targets).map {|target| DocRank::Document.new target }
|
19
|
+
end
|
20
|
+
|
21
|
+
# Rank the target documents based on any keyword matches with the base
|
22
|
+
# document regardless of weight.
|
23
|
+
# @return [Hash] a hash of document names mapped to their number of keyword matches.
|
24
|
+
def scores
|
25
|
+
@score ||= rank match_keywords
|
26
|
+
end
|
27
|
+
|
28
|
+
# Rank the target documents based on any keyword matches with the base
|
29
|
+
# document by summing the weights of any match.
|
30
|
+
# @return [Hash] a hash of document names mapped to their scores.
|
31
|
+
def weighted_scores
|
32
|
+
@weighted_scores ||= rank match_keywords(weighted: true)
|
33
|
+
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
def match_keywords(options = {})
|
38
|
+
@target_docs.map do |target|
|
39
|
+
target_kws = target.weighted_keywords
|
40
|
+
matches = @base_kws.keys & target_kws.keys
|
41
|
+
score = options[:weighted] ? sum_weights(matches, target_kws) : matches.count
|
42
|
+
[target.name, score]
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def sum_weights(matches, target)
|
47
|
+
matches.sum {|match| @base_kws[match] + target[match]}
|
48
|
+
end
|
49
|
+
|
50
|
+
def rank(docs)
|
51
|
+
docs.sort_by{|doc| doc[1]}.to_h
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
data/lib/doc_rank/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doc_rank
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0.alpha.
|
4
|
+
version: 0.0.0.alpha.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- s1v4
|
@@ -9,50 +9,50 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-09-
|
12
|
+
date: 2017-09-15 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
15
|
+
name: highscore
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
18
|
- - "~>"
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version:
|
21
|
-
type: :
|
20
|
+
version: 1.2.1
|
21
|
+
type: :runtime
|
22
22
|
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
25
|
- - "~>"
|
26
26
|
- !ruby/object:Gem::Version
|
27
|
-
version:
|
27
|
+
version: 1.2.1
|
28
28
|
- !ruby/object:Gem::Dependency
|
29
|
-
name:
|
29
|
+
name: yomu
|
30
30
|
requirement: !ruby/object:Gem::Requirement
|
31
31
|
requirements:
|
32
32
|
- - "~>"
|
33
33
|
- !ruby/object:Gem::Version
|
34
|
-
version:
|
35
|
-
type: :
|
34
|
+
version: 0.2.4
|
35
|
+
type: :runtime
|
36
36
|
prerelease: false
|
37
37
|
version_requirements: !ruby/object:Gem::Requirement
|
38
38
|
requirements:
|
39
39
|
- - "~>"
|
40
40
|
- !ruby/object:Gem::Version
|
41
|
-
version:
|
41
|
+
version: 0.2.4
|
42
42
|
- !ruby/object:Gem::Dependency
|
43
|
-
name:
|
43
|
+
name: yard
|
44
44
|
requirement: !ruby/object:Gem::Requirement
|
45
45
|
requirements:
|
46
46
|
- - "~>"
|
47
47
|
- !ruby/object:Gem::Version
|
48
|
-
version: 0.
|
48
|
+
version: 0.9.9
|
49
49
|
type: :development
|
50
50
|
prerelease: false
|
51
51
|
version_requirements: !ruby/object:Gem::Requirement
|
52
52
|
requirements:
|
53
53
|
- - "~>"
|
54
54
|
- !ruby/object:Gem::Version
|
55
|
-
version: 0.
|
55
|
+
version: 0.9.9
|
56
56
|
- !ruby/object:Gem::Dependency
|
57
57
|
name: bundler
|
58
58
|
requirement: !ruby/object:Gem::Requirement
|
@@ -109,20 +109,6 @@ dependencies:
|
|
109
109
|
- - ">="
|
110
110
|
- !ruby/object:Gem::Version
|
111
111
|
version: '0'
|
112
|
-
- !ruby/object:Gem::Dependency
|
113
|
-
name: pry
|
114
|
-
requirement: !ruby/object:Gem::Requirement
|
115
|
-
requirements:
|
116
|
-
- - ">="
|
117
|
-
- !ruby/object:Gem::Version
|
118
|
-
version: '0'
|
119
|
-
type: :development
|
120
|
-
prerelease: false
|
121
|
-
version_requirements: !ruby/object:Gem::Requirement
|
122
|
-
requirements:
|
123
|
-
- - ">="
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
version: '0'
|
126
112
|
- !ruby/object:Gem::Dependency
|
127
113
|
name: pry-nav
|
128
114
|
requirement: !ruby/object:Gem::Requirement
|
@@ -158,6 +144,7 @@ files:
|
|
158
144
|
- doc_rank.gemspec
|
159
145
|
- lib/doc_rank.rb
|
160
146
|
- lib/doc_rank/document.rb
|
147
|
+
- lib/doc_rank/ranker.rb
|
161
148
|
- lib/doc_rank/version.rb
|
162
149
|
homepage: https://github.com/S1v4/doc_rank
|
163
150
|
licenses:
|
@@ -171,7 +158,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
171
158
|
requirements:
|
172
159
|
- - ">="
|
173
160
|
- !ruby/object:Gem::Version
|
174
|
-
version: '
|
161
|
+
version: '2.4'
|
175
162
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
176
163
|
requirements:
|
177
164
|
- - ">"
|