documentally 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f7ba9de32ab5018f410bf3e823c9f8a219b52fc0
4
+ data.tar.gz: 723aba4f6bc91aaa0b8c189b4026af67b5bee329
5
+ SHA512:
6
+ metadata.gz: 1ed6ff89c2a3866d00289a21592b03ce14722730dbb1709ceb3a4d14549bfe3e3e2c74cfa71a2518b57c8944f27bf821766c5425fd34b231f85e1020a973832a
7
+ data.tar.gz: 7081161224242150f5d9cf2a15d31edfeb765bb6bdc8150e9328fb3316e9995bf8b2885e59f2aae438e9bd79909f237aa768eec4008f24504a50dddf6859f527
data/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # Documentally
2
+
3
+ Documentally uses [TF-IDF](http://en.wikipedia.org/wiki/tf-idf) to allow you to easily search and compare text documents.
4
+
5
+ It's still under development, so the documentation is basically just the tests right now, but an example use would be:
6
+
7
+ ```ruby
8
+ term_lists = [['doc_1', ['foo', 'foo', 'bar']],
9
+ ['doc_2', ['bar', 'baz', 'baz']],
10
+ ['doc_3', ['bar', 'baz', 'foo']]]
11
+
12
+ corpus = Documentally::Corpus.new(term_lists)
13
+
14
+ query = Documentally::Document.new('query', ['baz'])
15
+
16
+ corpus.search(query).map(&:name) # => ['doc_2']
17
+ ```
18
+
19
+ ## Installation
20
+
21
+ ```sh
22
+ gem install documentally
23
+ ```
@@ -0,0 +1,5 @@
1
# Root namespace for the library. It is declared before the requires below
# so that the files they load can define their classes inside it.
module Documentally; end

require 'documentally/document'
require 'documentally/corpus'
@@ -0,0 +1,23 @@
1
module Documentally
  # A collection of documents whose term weights have been rescaled against
  # the combined term counts of the whole collection, and which can be ranked
  # against a query document.
  class Corpus
    # @return [Array<Documentally::Document>] the normalized documents, in the
    #   order they were supplied
    attr_reader :documents

    # Builds one document per entry and normalizes each against a "master"
    # document that holds every term occurrence from every entry.
    #
    # @param term_lists [Array<Array>] pairs of `[name, terms]`, where `terms`
    #   is an array with one element per term occurrence
    def initialize(term_lists)
      # flat_map yields [] for an empty corpus (a bare inject would yield nil)
      # and avoids building a new intermediate array per concatenation.
      all_terms = term_lists.flat_map(&:last)
      master_document = Documentally::Document.new('master', all_terms)

      @documents = term_lists.map do |name, term_list|
        Documentally::Document.new(name, term_list).tap do |document|
          document.normalize!(master_document)
        end
      end
    end

    # Returns the documents most similar to +query+, best match first.
    #
    # @param query [#frequency] the document to compare against
    # @param take [Integer] maximum number of documents to return
    # @return [Array<Documentally::Document>] at most +take+ documents
    def search(query, take: 1)
      order_documents_by(query).first(take)
    end

    private

    # Sorts the documents by descending similarity to +query+.
    def order_documents_by(query)
      # to_f turns a nil score (e.g. from a document with no terms) into 0.0
      # so that sorting never has to compare nil with a Float.
      documents.sort_by { |document| document.similarity(query).to_f }.reverse
    end
  end
end
@@ -0,0 +1,40 @@
1
module Documentally
  # A named bag of terms with a numeric weight per term.
  #
  # Weights start out as occurrence counts (stored as Floats) and can be
  # rescaled in place against another document with #normalize!.
  class Document
    # @return [Hash] term => weight
    attr_accessor :term_hash
    # @return [Object] the identifier given at construction
    attr_reader :name

    # @param name [Object] identifier for this document
    # @param terms [Enumerable] one element per term occurrence
    def initialize(name, terms)
      @name = name
      @term_hash = Hash.new(0.0)

      terms.each { |term| @term_hash[term] += 1 }
    end

    # @return [String] the document name as a string
    def to_s
      name.to_s
    end

    # @return [Array] the distinct terms in this document
    def terms
      term_hash.keys
    end

    # @param term [Object] the term to look up
    # @return [Float] the weight of +term+, or 0.0 when the term is absent
    def frequency(term)
      # fetch keeps the 0.0 fallback even if term_hash was reassigned to a
      # hash without a default value.
      term_hash.fetch(term, 0.0)
    end

    # Two documents are equal when every term has the same weight in both;
    # names are not compared.
    #
    # @param other [Object]
    # @return [Boolean] false for objects that are not document-like
    def ==(other)
      return false unless other.respond_to?(:terms) && other.respond_to?(:frequency)

      (terms | other.terms).all? { |term| frequency(term) == other.frequency(term) }
    end

    # Sum, over this document's terms, of this document's weight times the
    # query's weight for the same term.
    #
    # @param query [#frequency] the document to compare against
    # @return [Float] 0.0 when this document has no terms
    def similarity(query)
      # Seeding with 0.0 keeps the result numeric for an empty document.
      terms.inject(0.0) { |total, term| total + frequency(term) * query.frequency(term) }
    end

    # Divides each of this document's weights, in place, by the weight the
    # same term has in +corpus+.
    #
    # NOTE(review): a term whose weight in +corpus+ is 0.0 ends up with an
    # infinite weight here; callers are expected to pass a document that
    # contains every term of this one.
    #
    # @param corpus [#frequency] document supplying the divisors
    def normalize!(corpus)
      terms.each do |term|
        term_hash[term] /= corpus.frequency(term)
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: documentally
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Harry Schwartz
8
+ - Dan Honey
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-02-09 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description:
15
+ email: hello@harryrschwartz.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - README.md
21
+ - lib/documentally.rb
22
+ - lib/documentally/corpus.rb
23
+ - lib/documentally/document.rb
24
+ homepage: http://github.com/hrs/documentally
25
+ licenses:
26
+ - GPL
27
+ metadata: {}
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 2.2.2
45
+ signing_key:
46
+ specification_version: 4
47
+ summary: Simple TF-IDF document search library.
48
+ test_files: []