inverted_index 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,17 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+ gem "nokogiri", ">= 1.5.0"
6
+ gem "hpricot", ">= 0.8.6"
7
+
8
+ # Add dependencies to develop your gem here.
9
+ # Include everything needed to run rake, tests, features, etc.
10
+ group :development do
11
+ gem "shoulda", ">= 0"
12
+ gem "rdoc", "~> 3.12"
13
+ gem "bundler", "~> 1.0.0"
14
+ gem "jeweler", "~> 1.8.3"
15
+ #gem "rcov", ">= 0"
16
+ end
17
+
data/Gemfile.lock ADDED
@@ -0,0 +1,31 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ hpricot (0.8.6)
6
+ jeweler (1.8.3)
7
+ bundler (~> 1.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ rdoc
11
+ json (1.6.5)
12
+ nokogiri (1.5.0)
13
+ rake (0.9.2.2)
14
+ rdoc (3.12)
15
+ json (~> 1.4)
16
+ shoulda (3.0.1)
17
+ shoulda-context (~> 1.0.0)
18
+ shoulda-matchers (~> 1.0.0)
19
+ shoulda-context (1.0.0)
20
+ shoulda-matchers (1.0.0)
21
+
22
+ PLATFORMS
23
+ ruby
24
+
25
+ DEPENDENCIES
26
+ bundler (~> 1.0.0)
27
+ hpricot (>= 0.8.6)
28
+ jeweler (~> 1.8.3)
29
+ nokogiri (>= 1.5.0)
30
+ rdoc (~> 3.12)
31
+ shoulda
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 sfigart
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,4 @@
1
+ = inverted_index
2
+
3
+ This is a set of classes that allow an inverted index to be created.
4
+ This is a project for a course and is not intended to be used in a production system.
data/Rakefile ADDED
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
# Declares the gem's metadata; Jeweler generates inverted_index.gemspec from it.
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "inverted_index"
  gem.homepage = "http://github.com/sfigart/inverted_index"
  gem.license = "MIT"
  gem.summary = %Q{Inverted Index}
  gem.description = %Q{Inverted Index implementation}
  gem.email = "sfigart@gmail.com"
  gem.authors = ["sfigart"]
  # Dependencies are defined in the Gemfile, which Jeweler already picks up.
  # Do not repeat nokogiri/hpricot here with gem.add_dependency: doing so made
  # each of them appear twice in the generated gemspec.
end
28
+ Jeweler::RubygemsDotOrgTasks.new
29
+
30
+ require 'rake/testtask'
31
+ Rake::TestTask.new(:test) do |test|
32
+ test.libs << 'lib' << 'test'
33
+ test.pattern = 'test/**/test_*.rb'
34
+ test.verbose = true
35
+ end
36
+
37
+ =begin
38
+ require 'rcov/rcovtask'
39
+ Rcov::RcovTask.new do |test|
40
+ test.libs << 'test'
41
+ test.pattern = 'test/**/test_*.rb'
42
+ test.verbose = true
43
+ test.rcov_opts << '--exclude "gems/*"'
44
+ end
45
+ =end
46
+
47
+ task :default => :test
48
+
49
+ require 'rdoc/task'
50
+ Rake::RDocTask.new do |rdoc|
51
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
52
+
53
+ rdoc.rdoc_dir = 'rdoc'
54
+ rdoc.title = "inverted_index #{version}"
55
+ rdoc.rdoc_files.include('README*')
56
+ rdoc.rdoc_files.include('lib/**/*.rb')
57
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.3
@@ -0,0 +1,74 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "inverted_index"
8
+ s.version = "0.0.3"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["sfigart"]
12
+ s.date = "2012-03-17"
13
+ s.description = "Inverted Index implementation"
14
+ s.email = "sfigart@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ "Gemfile",
22
+ "Gemfile.lock",
23
+ "LICENSE.txt",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "inverted_index.gemspec",
28
+ "lib/inverted_index.rb",
29
+ "lib/inverted_index/cleaner.rb",
30
+ "lib/inverted_index/parse.rb",
31
+ "lib/inverted_index/stopwords.rb",
32
+ "test/helper.rb",
33
+ "test/test_inverted_index.rb"
34
+ ]
35
+ s.homepage = "http://github.com/sfigart/inverted_index"
36
+ s.licenses = ["MIT"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = "1.8.17"
39
+ s.summary = "Inverted Index"
40
+
41
+ if s.respond_to? :specification_version then
42
+ s.specification_version = 3
43
+
44
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
45
+ s.add_runtime_dependency(%q<nokogiri>, [">= 1.5.0"])
46
+ s.add_runtime_dependency(%q<hpricot>, [">= 0.8.6"])
47
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
48
+ s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
49
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
50
+ s.add_development_dependency(%q<jeweler>, ["~> 1.8.3"])
51
+ s.add_runtime_dependency(%q<nokogiri>, [">= 1.5.0"])
52
+ s.add_runtime_dependency(%q<hpricot>, [">= 0.8.6"])
53
+ else
54
+ s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
55
+ s.add_dependency(%q<hpricot>, [">= 0.8.6"])
56
+ s.add_dependency(%q<shoulda>, [">= 0"])
57
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
58
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
59
+ s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
60
+ s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
61
+ s.add_dependency(%q<hpricot>, [">= 0.8.6"])
62
+ end
63
+ else
64
+ s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
65
+ s.add_dependency(%q<hpricot>, [">= 0.8.6"])
66
+ s.add_dependency(%q<shoulda>, [">= 0"])
67
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
68
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
69
+ s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
70
+ s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
71
+ s.add_dependency(%q<hpricot>, [">= 0.8.6"])
72
+ end
73
+ end
74
+
@@ -0,0 +1,3 @@
1
+ require 'inverted_index/cleaner'
2
+ require 'inverted_index/parse'
3
+ require 'inverted_index/stopwords'
@@ -0,0 +1,36 @@
module InvertedIndex
  # Turns raw tokens into normalized index terms.
  class Cleaner
    # Clock times such as "00:00", "00:00:00", "00:00:00 a.m." or "00:00:00 pm".
    TIME_PATTERN = /(\d\d:\d\d(:\d\d)?(\s(a|p)\.?m\.?)?)/i

    # Normalizes +tokens+ into index terms.
    #
    # Steps, in order: lowercase, remove stopwords, strip non-word characters
    # (dropping tokens that end up empty), append clock times found in +text+,
    # drop non-ASCII terms, stem.
    #
    # tokens - Array of String tokens. It is not modified.
    # text   - String scanned for clock times (default: '').
    #
    # Returns a new Array of lowercase, stemmed String terms.
    def self.clean(tokens, text='')
      # map, not each: each returns the receiver untouched, so the lowercased
      # copies were thrown away and capitalized stopwords ("The") survived the
      # stopword removal below.
      terms = tokens.map { |token| token.downcase }

      # Remove stopwords
      terms -= InvertedIndex::Stopwords.words

      # Remove all non-word characters; tokens left empty are dropped
      terms = terms.map { |token| token.gsub(/\W/, '') }.reject { |word| word.empty? }

      # TODO: Scan text for other special text (e.g. dates)
      # A date looks like /((january|february|march)\s\d,\s\d\d\d\d)/i
      #
      # Times are appended after the non-word strip above, so they keep their
      # ':' and '.' characters.
      text.scan(TIME_PATTERN).each { |match| terms << match[0].downcase.strip }

      # Remove all non-ascii words
      terms = terms.select { |term| term.ascii_only? }

      # Stem, calling stem once per term.
      # NOTE(review): String#stem is not defined by anything visible in this
      # file; presumably a stemming gem loaded elsewhere provides it -- confirm.
      stems = terms.map { |term| term.stem }
      stems.reject { |stem| stem.empty? }.map { |stem| stem.downcase }
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'hpricot'
2
+
module InvertedIndex
  # Extracts the whitespace-separated words of an HTML document's text nodes.
  class Parse
    attr_accessor :html, :doc, :body, :text, :tokens

    # Elements whose content shouldn't be indexed; removed in this order.
    SKIPPED_ELEMENTS = %w[head script style iframe embed].freeze

    # html - the HTML markup that #parse will process.
    def initialize(html)
      @html = html
    end

    # Parses @html with Hpricot, removes the SKIPPED_ELEMENTS, and collects the
    # words of every text node matched by "body//*/text()" into @tokens.
    #
    # Returns the collected words joined by single spaces (also stored in @text).
    def parse
      @doc = Hpricot(@html)

      # Remove sections that shouldn't be indexed
      SKIPPED_ELEMENTS.each { |name| @doc.search(name).remove }

      # Get all text nodes
      @text_nodes = (@doc/"body//*/text()")
      @tokens = []

      # Clean up each text node. split(' ') never yields empty strings, so the
      # words can be appended without an extra empty check.
      @text_nodes.each do |node|
        @tokens.concat(clean(node.to_plain_text.strip).split(' '))
      end

      # Return text separated by spaces
      @text = @tokens.join(' ')
    end

    # Replaces each newline and tab in +text+ with a space, then strips leading
    # and trailing whitespace.
    #
    # Returns the cleaned String.
    def clean(text)
      text.tr("\n\t", ' ').strip
    end
  end
end
@@ -0,0 +1,49 @@
module InvertedIndex
  # English stopword list: lowercase words and single letters.
  class Stopwords
    # Built once at load time instead of on every call; duplicate entries that
    # were in the original list (down, high, new, right, still) are removed.
    WORDS = %w[
      a about above across after again against all almost alone along
      already also although always among an and another any anybody
      anyone anything anywhere are area areas around as ask asked
      asking asks at away b back backed backing backs be became
      because become becomes been before began behind being beings
      best better between big both but by c came can cannot case
      cases certain certainly clear clearly come could d did differ
      different differently do does done down downed downing
      downs during e each early either end ended ending ends enough
      even evenly ever every everybody everyone everything everywhere f
      face faces fact facts far felt few find finds first for four from
      full fully further furthered furthering furthers g gave general
      generally get gets give given gives go going good goods got
      great greater greatest group grouped grouping groups h had has
      have having he her here herself high higher highest
      him himself his how however i if important in interest interested
      interesting interests into is it its itself j just k keep keeps kind
      knew know known knows l large largely last later latest least
      less let lets like likely long longer longest m made make making
      man many may me member members men might more most
      mostly mr mrs much must my myself n necessary need needed
      needing needs never new newer newest next no nobody non
      noone not nothing now nowhere number numbers o of off often
      old older oldest on once one only open opened opening opens
      or order ordered ordering orders other others our out over p
      part parted parting parts per perhaps place places point pointed
      pointing points possible present presented presenting presents problem
      problems put puts q quite r rather really right room
      rooms s said same saw say says second seconds see seem
      seemed seeming seems sees several shall she should show showed
      showing shows side sides since small smaller smallest so some
      somebody someone something somewhere state states still such
      sure t take taken than that the their them then there therefore
      these they thing things think thinks this those though thought
      thoughts three through thus to today together too took toward
      turn turned turning turns two u under until up upon us use
      used uses v very w want wanted wanting wants was way ways
      we well wells went were what when where whether which while
      who whole whose why will with within without work worked
      working works would x y year years yet you young younger
      youngest your yours z
    ].freeze

    # Returns a new Array of the stopwords on every call, so callers may add or
    # remove entries without affecting later calls. The String elements are
    # shared between calls and should be treated as read-only.
    def self.words
      WORDS.dup
    end
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+ require 'shoulda'
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'inverted_index'
16
+
17
+ class Test::Unit::TestCase
18
+ end
@@ -0,0 +1,7 @@
1
+ require 'helper'
2
+
3
+ class TestInvertedIndex < Test::Unit::TestCase
4
+ should "probably rename this file and start testing for real" do
5
+ flunk "hey buddy, you should probably rename this file and start testing for real"
6
+ end
7
+ end
metadata ADDED
@@ -0,0 +1,152 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: inverted_index
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - sfigart
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-17 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: &70366066585320 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.5.0
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70366066585320
25
+ - !ruby/object:Gem::Dependency
26
+ name: hpricot
27
+ requirement: &70366066588780 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: 0.8.6
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *70366066588780
36
+ - !ruby/object:Gem::Dependency
37
+ name: shoulda
38
+ requirement: &70366065868620 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70366065868620
47
+ - !ruby/object:Gem::Dependency
48
+ name: rdoc
49
+ requirement: &70366065870080 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '3.12'
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *70366065870080
58
+ - !ruby/object:Gem::Dependency
59
+ name: bundler
60
+ requirement: &70366065871520 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ~>
64
+ - !ruby/object:Gem::Version
65
+ version: 1.0.0
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *70366065871520
69
+ - !ruby/object:Gem::Dependency
70
+ name: jeweler
71
+ requirement: &70366065873460 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 1.8.3
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *70366065873460
80
+ - !ruby/object:Gem::Dependency
81
+ name: nokogiri
82
+ requirement: &70366065857600 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: 1.5.0
88
+ type: :runtime
89
+ prerelease: false
90
+ version_requirements: *70366065857600
91
+ - !ruby/object:Gem::Dependency
92
+ name: hpricot
93
+ requirement: &70366065766900 !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ! '>='
97
+ - !ruby/object:Gem::Version
98
+ version: 0.8.6
99
+ type: :runtime
100
+ prerelease: false
101
+ version_requirements: *70366065766900
102
+ description: Inverted Index implementation
103
+ email: sfigart@gmail.com
104
+ executables: []
105
+ extensions: []
106
+ extra_rdoc_files:
107
+ - LICENSE.txt
108
+ - README.rdoc
109
+ files:
110
+ - .document
111
+ - Gemfile
112
+ - Gemfile.lock
113
+ - LICENSE.txt
114
+ - README.rdoc
115
+ - Rakefile
116
+ - VERSION
117
+ - inverted_index.gemspec
118
+ - lib/inverted_index.rb
119
+ - lib/inverted_index/cleaner.rb
120
+ - lib/inverted_index/parse.rb
121
+ - lib/inverted_index/stopwords.rb
122
+ - test/helper.rb
123
+ - test/test_inverted_index.rb
124
+ homepage: http://github.com/sfigart/inverted_index
125
+ licenses:
126
+ - MIT
127
+ post_install_message:
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ none: false
133
+ requirements:
134
+ - - ! '>='
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ segments:
138
+ - 0
139
+ hash: -747465224356148538
140
+ required_rubygems_version: !ruby/object:Gem::Requirement
141
+ none: false
142
+ requirements:
143
+ - - ! '>='
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ requirements: []
147
+ rubyforge_project:
148
+ rubygems_version: 1.8.17
149
+ signing_key:
150
+ specification_version: 3
151
+ summary: Inverted Index
152
+ test_files: []