tf-idf-similarity 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.travis.yml +1 -1
- data/README.md +7 -6
- data/lib/tf-idf-similarity.rb +0 -3
- data/lib/tf-idf-similarity/document.rb +7 -5
- data/lib/tf-idf-similarity/token.rb +7 -0
- data/lib/tf-idf-similarity/tokenizer.rb +19 -0
- data/lib/tf-idf-similarity/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 605ac457508eaf64a7e583e8a4a71af231d3d9d2f9c30ee82b25fb9f647d1312
|
4
|
+
data.tar.gz: f24b89dccdcbef3c4fcaa59d15050f064455859c134c550fd6a432346883eb31
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a41195c6543dea206baa8ce3e2095437d1df94fabedcc76a8151fa5af5991524d96530710a7216c1fef48a7008f88a43773ce2a2323afa563fa29f5abed9909c
|
7
|
+
data.tar.gz: aadbb85d6bd74625088d0aa7cb58b4127337d5c1dcc2af13c22664f1562013c59d79d8b3bcc3564a2861dfd968d39770205d3b401114e8bdf870b2ac412fda26
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
@@ -18,7 +18,7 @@ addons:
|
|
18
18
|
# Installing ATLAS will install BLAS.
|
19
19
|
- libatlas-dev
|
20
20
|
- libatlas-base-dev
|
21
|
-
-
|
21
|
+
- libatlas3-base
|
22
22
|
before_install:
|
23
23
|
- bundle config build.nmatrix --with-lapacklib
|
24
24
|
- export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas
|
data/README.md
CHANGED
@@ -1,12 +1,11 @@
|
|
1
|
-
# Ruby Vector Space Model (VSM) with tf*idf weights
|
1
|
+
# Ruby Vector Space Model (VSM) with tf\*idf weights
|
2
2
|
|
3
3
|
[](https://badge.fury.io/rb/tf-idf-similarity)
|
4
4
|
[](https://travis-ci.org/jpmckinney/tf-idf-similarity)
|
5
|
-
[](https://gemnasium.com/jpmckinney/tf-idf-similarity)
|
6
5
|
[](https://coveralls.io/r/jpmckinney/tf-idf-similarity)
|
7
6
|
[](https://codeclimate.com/github/jpmckinney/tf-idf-similarity)
|
8
7
|
|
9
|
-
Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf
|
8
|
+
Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf\*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
|
10
9
|
|
11
10
|
## Usage
|
12
11
|
|
@@ -48,7 +47,7 @@ Find the similarity of two documents in the matrix:
|
|
48
47
|
matrix[model.document_index(document1), model.document_index(document2)]
|
49
48
|
```
|
50
49
|
|
51
|
-
Print the tf*idf values for terms in a document:
|
50
|
+
Print the tf\*idf values for terms in a document:
|
52
51
|
|
53
52
|
```ruby
|
54
53
|
tfidf_by_term = {}
|
@@ -86,6 +85,8 @@ end
|
|
86
85
|
document1 = TfIdfSimilarity::Document.new(text, :term_counts => term_counts, :size => size)
|
87
86
|
```
|
88
87
|
|
88
|
+
Or, use your own classes for the tokenizer and tokens, like in [this example](https://gist.github.com/satoryu/0183a4eba365cc67e28988a09f3035b3).
|
89
|
+
|
89
90
|
[Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
|
90
91
|
|
91
92
|
## Troubleshooting
|
@@ -114,11 +115,11 @@ You can access more term frequency, document frequency, and normalization formul
|
|
114
115
|
require 'tf-idf-similarity/extras/document'
|
115
116
|
require 'tf-idf-similarity/extras/tf_idf_model'
|
116
117
|
|
117
|
-
The default tf
|
118
|
+
The default tf\*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
|
118
119
|
|
119
120
|
## Why?
|
120
121
|
|
121
|
-
At the time of writing, no other Ruby gem implemented the tf*idf formula used by Lucene, Sphinx and Ferret.
|
122
|
+
At the time of writing, no other Ruby gem implemented the tf\*idf formula used by Lucene, Sphinx and Ferret.
|
122
123
|
|
123
124
|
* [rsemantic](https://github.com/josephwilk/rsemantic) now uses the same [term frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L14) and [document frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L13) formulas as Lucene.
|
124
125
|
* [treat](https://github.com/louismullie/treat) offers many term frequency formulas, [one of which](https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L13) is the same as Lucene.
|
data/lib/tf-idf-similarity.rb
CHANGED
data/lib/tf-idf-similarity/document.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'tf-idf-similarity/tokenizer'
|
2
|
+
|
1
3
|
# A document.
|
2
4
|
module TfIdfSimilarity
|
3
5
|
class Document
|
@@ -19,7 +21,8 @@ module TfIdfSimilarity
|
|
19
21
|
def initialize(text, opts = {})
|
20
22
|
@text = text
|
21
23
|
@id = opts[:id] || object_id
|
22
|
-
@tokens = opts[:tokens]
|
24
|
+
@tokens = Array(opts[:tokens]).map { |t| Token.new(t) } if opts[:tokens]
|
25
|
+
@tokenizer = opts[:tokenizer] || Tokenizer.new
|
23
26
|
|
24
27
|
if opts[:term_counts]
|
25
28
|
@term_counts = opts[:term_counts]
|
@@ -51,10 +54,9 @@ module TfIdfSimilarity
|
|
51
54
|
|
52
55
|
# Tokenizes the text and counts terms and total tokens.
|
53
56
|
def set_term_counts_and_size
|
54
|
-
tokenize(text).each do |word|
|
55
|
-
token = Token.new(word)
|
57
|
+
tokenize(text).each do |token|
|
56
58
|
if token.valid?
|
57
|
-
term = token.lowercase_filter.classic_filter.to_s
|
59
|
+
term = token.to_s
|
58
60
|
@term_counts[term] += 1
|
59
61
|
@size += 1
|
60
62
|
end
|
@@ -76,7 +78,7 @@ module TfIdfSimilarity
|
|
76
78
|
# @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
|
77
79
|
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
|
78
80
|
def tokenize(text)
|
79
|
-
@tokens || UnicodeUtils.each_word(text)
|
81
|
+
@tokens || @tokenizer.tokenize(text)
|
80
82
|
end
|
81
83
|
end
|
82
84
|
end
|
data/lib/tf-idf-similarity/token.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
require 'delegate'
|
3
|
+
require 'unicode_utils/downcase'
|
4
|
+
require 'unicode_utils/each_word'
|
3
5
|
|
4
6
|
# A token.
|
5
7
|
#
|
@@ -47,5 +49,10 @@ module TfIdfSimilarity
|
|
47
49
|
def classic_filter
|
48
50
|
self.class.new(self.gsub('.', '').sub(/['`’]s\z/, ''))
|
49
51
|
end
|
52
|
+
|
53
|
+
def to_s
|
54
|
+
# Don't call #lowercase_filter and #classic_filter to avoid creating unnecessary objects.
|
55
|
+
UnicodeUtils.downcase(self).gsub('.', '').sub(/['`’]s\z/, '')
|
56
|
+
end
|
50
57
|
end
|
51
58
|
end
|
data/lib/tf-idf-similarity/tokenizer.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'unicode_utils/each_word'
|
2
|
+
require 'tf-idf-similarity/token'
|
3
|
+
|
4
|
+
# A tokenizer using UnicodeUtils to tokenize a text.
|
5
|
+
#
|
6
|
+
# @see https://github.com/lang/unicode_utils
|
7
|
+
module TfIdfSimilarity
|
8
|
+
class Tokenizer
|
9
|
+
# Tokenizes a text.
|
10
|
+
#
|
11
|
+
# @param [String] text
|
12
|
+
# @return [Enumerator] an enumerator of Token objects
|
13
|
+
def tokenize(text)
|
14
|
+
UnicodeUtils.each_word(text).map do |word|
|
15
|
+
Token.new(word)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tf-idf-similarity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James McKinney
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-12-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode_utils
|
@@ -104,6 +104,7 @@ files:
|
|
104
104
|
- lib/tf-idf-similarity/term_count_model.rb
|
105
105
|
- lib/tf-idf-similarity/tf_idf_model.rb
|
106
106
|
- lib/tf-idf-similarity/token.rb
|
107
|
+
- lib/tf-idf-similarity/tokenizer.rb
|
107
108
|
- lib/tf-idf-similarity/version.rb
|
108
109
|
- spec/bm25_model_spec.rb
|
109
110
|
- spec/document_spec.rb
|
@@ -133,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
133
134
|
version: '0'
|
134
135
|
requirements: []
|
135
136
|
rubyforge_project:
|
136
|
-
rubygems_version: 2.
|
137
|
+
rubygems_version: 2.7.6
|
137
138
|
signing_key:
|
138
139
|
specification_version: 4
|
139
140
|
summary: Calculates the similarity between texts using tf*idf
|