documentally 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f7ba9de32ab5018f410bf3e823c9f8a219b52fc0
4
+ data.tar.gz: 723aba4f6bc91aaa0b8c189b4026af67b5bee329
5
+ SHA512:
6
+ metadata.gz: 1ed6ff89c2a3866d00289a21592b03ce14722730dbb1709ceb3a4d14549bfe3e3e2c74cfa71a2518b57c8944f27bf821766c5425fd34b231f85e1020a973832a
7
+ data.tar.gz: 7081161224242150f5d9cf2a15d31edfeb765bb6bdc8150e9328fb3316e9995bf8b2885e59f2aae438e9bd79909f237aa768eec4008f24504a50dddf6859f527
data/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # Documentally
2
+
3
+ Documentally uses [TF-IDF](http://en.wikipedia.org/wiki/tf-idf) to allow you to easily search and compare text documents.
4
+
5
+ It's still under development, so the tests are currently the main documentation. A basic example:
6
+
7
+ ```ruby
8
+ term_lists = [['doc_1', ['foo', 'foo', 'bar']],
9
+ ['doc_2', ['bar', 'baz', 'baz']],
10
+ ['doc_3', ['bar', 'baz', 'foo']]]
11
+
12
+ corpus = Documentally::Corpus.new(term_lists)
13
+
14
+ query = Documentally::Document.new('query', ['baz'])
15
+
16
+ corpus.search(query).map(&:name) # => ['doc_2']
17
+ ```
18
+
19
+ ## Installation
20
+
21
+ ```sh
22
+ gem install documentally
23
+ ```
@@ -0,0 +1,5 @@
1
# Top-level namespace for the library. It is declared before the requires below so the namespace already exists when those files are loaded.
+ module Documentally
2
+ end
3
+
4
+ require 'documentally/document'
5
+ require 'documentally/corpus'
@@ -0,0 +1,23 @@
1
module Documentally
  # A searchable collection of documents.
  #
  # Each document is built from a [name, terms] pair and has its term
  # frequencies divided by the frequency of the same term across the whole
  # corpus (see Document#normalize!).
  class Corpus
    # @return [Array<Documentally::Document>] the normalized documents, in the
    #   order they were given
    attr_reader :documents

    # @param term_lists [Array<Array>] pairs of [name, terms], where terms is
    #   an array of terms; may be empty
    def initialize(term_lists)
      # flat_map copes with an empty corpus (inject(&:+) would yield nil) and
      # avoids re-allocating the accumulated array for every document.
      all_terms = term_lists.flat_map(&:last)
      master_document = Document.new('master', all_terms)

      @documents = term_lists.map { |name, term_list| Document.new(name, term_list) }
      @documents.each { |document| document.normalize!(master_document) }
    end

    # Returns the documents most similar to the query, best match first.
    #
    # @param query [Documentally::Document] document holding the search terms
    # @param take [Integer] maximum number of documents to return
    # @return [Array<Documentally::Document>] at most +take+ documents
    def search(query, take: 1)
      order_documents_by(query).take(take)
    end

    private

    # Documents sorted by descending similarity to the query.
    def order_documents_by(query)
      documents.sort_by { |document| document.similarity(query) }.reverse
    end
  end
end
@@ -0,0 +1,40 @@
1
module Documentally
  # A named bag of terms, stored as a hash from term to (Float) frequency.
  class Document
    # @return [Hash] term => frequency; unknown terms default to 0.0
    attr_accessor :term_hash
    # @return [Object] the name given at construction
    attr_reader :name

    # @param name [Object] identifier for the document
    # @param terms [Enumerable] the terms in the document; repeats are counted
    def initialize(name, terms)
      @name = name
      # The 0.0 default makes every count a Float, so normalize! divides
      # without integer truncation.
      @term_hash = Hash.new(0.0)

      terms.each { |term| @term_hash[term] += 1 }
    end

    # @return [String] the document's name as a string
    def to_s
      name.to_s
    end

    # @return [Array] the distinct terms recorded for this document
    def terms
      term_hash.keys
    end

    # @param term [Object] the term to look up
    # @return [Float] the stored frequency, or 0.0 for a term not present
    def frequency(term)
      term_hash[term]
    end

    # Two documents are equal when every term has the same frequency in both;
    # names are not compared. Anything that does not look like a document
    # (e.g. nil) is simply unequal rather than raising NoMethodError.
    #
    # @param other [Object]
    # @return [Boolean]
    def ==(other)
      return false unless other.respond_to?(:terms) && other.respond_to?(:frequency)

      (terms + other.terms).uniq.all? { |term| frequency(term) == other.frequency(term) }
    end

    # Sum, over this document's terms, of the product of the term's frequency
    # here and in the query.
    #
    # @param query [#frequency] the document to compare against
    # @return [Float] the similarity score; 0.0 when this document has no terms
    def similarity(query)
      # Seed with 0.0 so a document without terms scores 0.0 instead of nil,
      # which could not be compared against other scores when sorting.
      terms.inject(0.0) { |total, term| total + frequency(term) * query.frequency(term) }
    end

    # Divides each of this document's term frequencies, in place, by the
    # frequency of the same term in +corpus+.
    #
    # NOTE: a term that +corpus+ does not contain has frequency 0.0 there, so
    # the Float division yields Infinity (or NaN) for that term.
    #
    # @param corpus [#frequency] document holding corpus-wide term frequencies
    def normalize!(corpus)
      terms.each do |term|
        term_hash[term] /= corpus.frequency(term)
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: documentally
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Harry Schwartz
8
+ - Dan Honey
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-02-09 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description:
15
+ email: hello@harryrschwartz.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - README.md
21
+ - lib/documentally.rb
22
+ - lib/documentally/corpus.rb
23
+ - lib/documentally/document.rb
24
+ homepage: http://github.com/hrs/documentally
25
+ licenses:
26
+ - GPL
27
+ metadata: {}
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 2.2.2
45
+ signing_key:
46
+ specification_version: 4
47
+ summary: Simple TF-IDF document search library.
48
+ test_files: []