inverted_index 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,17 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+ gem "nokogiri", ">= 1.5.0"
6
+ gem "hpricot", ">= 0.8.6"
7
+
8
+ # Add dependencies to develop your gem here.
9
+ # Include everything needed to run rake, tests, features, etc.
10
+ group :development do
11
+ gem "shoulda", ">= 0"
12
+ gem "rdoc", "~> 3.12"
13
+ gem "bundler", "~> 1.0.0"
14
+ gem "jeweler", "~> 1.8.3"
15
+ #gem "rcov", ">= 0"
16
+ end
17
+
data/Gemfile.lock ADDED
@@ -0,0 +1,31 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ hpricot (0.8.6)
6
+ jeweler (1.8.3)
7
+ bundler (~> 1.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ rdoc
11
+ json (1.6.5)
12
+ nokogiri (1.5.0)
13
+ rake (0.9.2.2)
14
+ rdoc (3.12)
15
+ json (~> 1.4)
16
+ shoulda (3.0.1)
17
+ shoulda-context (~> 1.0.0)
18
+ shoulda-matchers (~> 1.0.0)
19
+ shoulda-context (1.0.0)
20
+ shoulda-matchers (1.0.0)
21
+
22
+ PLATFORMS
23
+ ruby
24
+
25
+ DEPENDENCIES
26
+ bundler (~> 1.0.0)
27
+ hpricot (>= 0.8.6)
28
+ jeweler (~> 1.8.3)
29
+ nokogiri (>= 1.5.0)
30
+ rdoc (~> 3.12)
31
+ shoulda
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 sfigart
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,4 @@
1
+ = inverted_index
2
+
3
+ This is a set of classes that allow an inverted index to be created.
4
+ This is a project for a course and is not intended to be used in a production system.
data/Rakefile ADDED
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
15
# Gem metadata used by Jeweler to generate inverted_index.gemspec.
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "inverted_index"
  gem.homepage = "http://github.com/sfigart/inverted_index"
  gem.license = "MIT"
  gem.summary = %Q{Inverted Index}
  gem.description = %Q{Inverted Index implementation}
  gem.email = "sfigart@gmail.com"
  gem.authors = ["sfigart"]
  # Dependencies are defined in the Gemfile (nokogiri and hpricot are listed
  # there). Do not repeat them here with gem.add_dependency: doing so made each
  # runtime dependency appear twice in the generated gemspec.
end
28
+ Jeweler::RubygemsDotOrgTasks.new
29
+
30
+ require 'rake/testtask'
31
+ Rake::TestTask.new(:test) do |test|
32
+ test.libs << 'lib' << 'test'
33
+ test.pattern = 'test/**/test_*.rb'
34
+ test.verbose = true
35
+ end
36
+
37
+ =begin
38
+ require 'rcov/rcovtask'
39
+ Rcov::RcovTask.new do |test|
40
+ test.libs << 'test'
41
+ test.pattern = 'test/**/test_*.rb'
42
+ test.verbose = true
43
+ test.rcov_opts << '--exclude "gems/*"'
44
+ end
45
+ =end
46
+
47
+ task :default => :test
48
+
49
+ require 'rdoc/task'
50
+ Rake::RDocTask.new do |rdoc|
51
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
52
+
53
+ rdoc.rdoc_dir = 'rdoc'
54
+ rdoc.title = "inverted_index #{version}"
55
+ rdoc.rdoc_files.include('README*')
56
+ rdoc.rdoc_files.include('lib/**/*.rb')
57
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.3
@@ -0,0 +1,74 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "inverted_index"
8
+ s.version = "0.0.3"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["sfigart"]
12
+ s.date = "2012-03-17"
13
+ s.description = "Inverted Index implementation"
14
+ s.email = "sfigart@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ "Gemfile",
22
+ "Gemfile.lock",
23
+ "LICENSE.txt",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "inverted_index.gemspec",
28
+ "lib/inverted_index.rb",
29
+ "lib/inverted_index/cleaner.rb",
30
+ "lib/inverted_index/parse.rb",
31
+ "lib/inverted_index/stopwords.rb",
32
+ "test/helper.rb",
33
+ "test/test_inverted_index.rb"
34
+ ]
35
+ s.homepage = "http://github.com/sfigart/inverted_index"
36
+ s.licenses = ["MIT"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = "1.8.17"
39
+ s.summary = "Inverted Index"
40
+
41
+ if s.respond_to? :specification_version then
42
+ s.specification_version = 3
43
+
44
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
45
+ s.add_runtime_dependency(%q<nokogiri>, [">= 1.5.0"])
46
+ s.add_runtime_dependency(%q<hpricot>, [">= 0.8.6"])
47
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
48
+ s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
49
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
50
+ s.add_development_dependency(%q<jeweler>, ["~> 1.8.3"])
51
+ s.add_runtime_dependency(%q<nokogiri>, [">= 1.5.0"])
52
+ s.add_runtime_dependency(%q<hpricot>, [">= 0.8.6"])
53
+ else
54
+ s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
55
+ s.add_dependency(%q<hpricot>, [">= 0.8.6"])
56
+ s.add_dependency(%q<shoulda>, [">= 0"])
57
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
58
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
59
+ s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
60
+ s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
61
+ s.add_dependency(%q<hpricot>, [">= 0.8.6"])
62
+ end
63
+ else
64
+ s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
65
+ s.add_dependency(%q<hpricot>, [">= 0.8.6"])
66
+ s.add_dependency(%q<shoulda>, [">= 0"])
67
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
68
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
69
+ s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
70
+ s.add_dependency(%q<nokogiri>, [">= 1.5.0"])
71
+ s.add_dependency(%q<hpricot>, [">= 0.8.6"])
72
+ end
73
+ end
74
+
@@ -0,0 +1,3 @@
1
+ require 'inverted_index/cleaner'
2
+ require 'inverted_index/parse'
3
+ require 'inverted_index/stopwords'
@@ -0,0 +1,36 @@
1
module InvertedIndex
  # Normalizes raw tokens into terms suitable for an inverted index.
  class Cleaner
    # Turns raw tokens, plus any time-of-day expressions found in +text+, into
    # lowercased, stopword-free, ASCII-only terms (stemmed when a stemmer is
    # available).
    #
    # tokens - Array of String tokens. The array itself is not modified.
    # text   - Optional source text scanned for times such as "10:30",
    #          "10:30:00" or "10:30 a.m."; nil is treated as empty text.
    #
    # Returns a new Array of Strings.
    def self.clean(tokens, text='')
      # To lowercase. This must build a new array (map): the stopword list is
      # lowercase, so comparing un-lowercased tokens would let "The" through.
      terms = tokens.map { |token| token.downcase }

      # Remove stopwords
      terms -= InvertedIndex::Stopwords.words

      # Remove all non-word characters and drop tokens that end up empty
      terms = terms.map { |term| term.gsub(/\W/, '') }.reject { |term| term.empty? }

      # TODO: Scan text for special text (e.g. dates, time)
      # A date looks like /((january|february|march)\s\d,\s\d\d\d\d)/i
      # A time looks like
      # 00:00 # 00:00:00 # 00:00:00 a.m. # 00:00:00 p.m. # 00:00:00 pm
      # Times are appended after punctuation stripping so they keep their
      # colons and dots. match[0] is the outermost group, i.e. the whole time.
      text.to_s.scan(/(\d\d:\d\d(:\d\d)?(\s(a|p)\.?m\.?)?)/i) do |match|
        terms << match[0].downcase.strip
      end

      # Remove all non-ascii words
      terms = terms.select { |term| term.ascii_only? }

      # Stem. String#stem is not part of core Ruby and this gem does not load a
      # stemmer itself; presumably the host application provides one. When it
      # is absent the term is kept unstemmed instead of raising NoMethodError.
      terms.each_with_object([]) do |term, stemmed|
        root = term.respond_to?(:stem) ? term.stem : term
        stemmed << root.downcase unless root.empty?
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'hpricot'
2
+
3
module InvertedIndex
  # Extracts whitespace-separated tokens from the text nodes of an HTML
  # document parsed with Hpricot.
  class Parse
    attr_accessor :html, :doc, :body, :text, :tokens

    # html - the HTML source to be parsed later by #parse.
    def initialize(html)
      @html = html
    end

    # Parses @html, removes elements that should not be indexed, and collects
    # the words of the remaining text nodes into @tokens.
    #
    # Returns the tokens joined by single spaces (also stored in @text).
    def parse
      @doc = Hpricot(@html)

      # Remove sections that shouldn't be indexed
      %w[head script style iframe embed].each do |tag|
        @doc.search(tag).remove
      end

      # Get all text nodes
      @text_nodes = @doc / "body//*/text()"
      @tokens = []

      # Clean up each text node and keep its non-empty words
      @text_nodes.each do |node|
        plain = node.to_plain_text.strip
        @tokens.concat(clean(plain).split(' ').reject { |word| word.empty? })
      end

      # Return text separated by spaces
      @text = @tokens.join(' ')
    end

    # Replaces every newline and tab in +text+ with a space and trims
    # surrounding whitespace.
    def clean(text)
      text.gsub(/(\n|\t)/,' ').strip
    end
  end
end
@@ -0,0 +1,49 @@
1
module InvertedIndex
  # English stopword list used to filter tokens before indexing.
  class Stopwords
    # Lowercase stopwords in alphabetical order, without duplicates.
    # Built once at load time and frozen so every call shares the same array.
    WORDS = %w[
      a about above across after again against all almost alone along
      already also although always among an and another any anybody
      anyone anything anywhere are area areas around as ask asked
      asking asks at away b back backed backing backs be became
      because become becomes been before began behind being beings
      best better between big both but by c came can cannot case
      cases certain certainly clear clearly come could d did differ
      different differently do does done down downed downing
      downs during e each early either end ended ending ends enough
      even evenly ever every everybody everyone everything everywhere f
      face faces fact facts far felt few find finds first for four from
      full fully further furthered furthering furthers g gave general
      generally get gets give given gives go going good goods got
      great greater greatest group grouped grouping groups h had has
      have having he her here herself high higher highest
      him himself his how however i if important in interest interested
      interesting interests into is it its itself j just k keep keeps kind
      knew know known knows l large largely last later latest least
      less let lets like likely long longer longest m made make making
      man many may me member members men might more most
      mostly mr mrs much must my myself n necessary need needed
      needing needs never new newer newest next no nobody non
      noone not nothing now nowhere number numbers o of off often
      old older oldest on once one only open opened opening opens
      or order ordered ordering orders other others our out over p
      part parted parting parts per perhaps place places point pointed
      pointing points possible present presented presenting presents problem
      problems put puts q quite r rather really right room
      rooms s said same saw say says second seconds see seem
      seemed seeming seems sees several shall she should show showed
      showing shows side sides since small smaller smallest so some
      somebody someone something somewhere state states still such
      sure t take taken than that the their them then there therefore
      these they thing things think thinks this those though thought
      thoughts three through thus to today together too took toward
      turn turned turning turns two u under until up upon us use
      used uses v very w want wanted wanting wants was way ways
      we well wells went were what when where whether which while
      who whole whose why will with within without work worked
      working works would x y year years yet you young younger
      youngest your yours z
    ].freeze

    # Returns the shared, frozen Array of stopword Strings. Use +dup+ if a
    # mutable copy is needed.
    def self.words
      WORDS
    end
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+ require 'shoulda'
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'inverted_index'
16
+
17
+ class Test::Unit::TestCase
18
+ end
@@ -0,0 +1,7 @@
1
+ require 'helper'
2
+
3
+ class TestInvertedIndex < Test::Unit::TestCase
4
+ should "probably rename this file and start testing for real" do
5
+ flunk "hey buddy, you should probably rename this file and start testing for real"
6
+ end
7
+ end
metadata ADDED
@@ -0,0 +1,152 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: inverted_index
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - sfigart
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-03-17 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: &70366066585320 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.5.0
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70366066585320
25
+ - !ruby/object:Gem::Dependency
26
+ name: hpricot
27
+ requirement: &70366066588780 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: 0.8.6
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *70366066588780
36
+ - !ruby/object:Gem::Dependency
37
+ name: shoulda
38
+ requirement: &70366065868620 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70366065868620
47
+ - !ruby/object:Gem::Dependency
48
+ name: rdoc
49
+ requirement: &70366065870080 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '3.12'
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *70366065870080
58
+ - !ruby/object:Gem::Dependency
59
+ name: bundler
60
+ requirement: &70366065871520 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ~>
64
+ - !ruby/object:Gem::Version
65
+ version: 1.0.0
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *70366065871520
69
+ - !ruby/object:Gem::Dependency
70
+ name: jeweler
71
+ requirement: &70366065873460 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 1.8.3
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *70366065873460
80
+ - !ruby/object:Gem::Dependency
81
+ name: nokogiri
82
+ requirement: &70366065857600 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: 1.5.0
88
+ type: :runtime
89
+ prerelease: false
90
+ version_requirements: *70366065857600
91
+ - !ruby/object:Gem::Dependency
92
+ name: hpricot
93
+ requirement: &70366065766900 !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ! '>='
97
+ - !ruby/object:Gem::Version
98
+ version: 0.8.6
99
+ type: :runtime
100
+ prerelease: false
101
+ version_requirements: *70366065766900
102
+ description: Inverted Index implementation
103
+ email: sfigart@gmail.com
104
+ executables: []
105
+ extensions: []
106
+ extra_rdoc_files:
107
+ - LICENSE.txt
108
+ - README.rdoc
109
+ files:
110
+ - .document
111
+ - Gemfile
112
+ - Gemfile.lock
113
+ - LICENSE.txt
114
+ - README.rdoc
115
+ - Rakefile
116
+ - VERSION
117
+ - inverted_index.gemspec
118
+ - lib/inverted_index.rb
119
+ - lib/inverted_index/cleaner.rb
120
+ - lib/inverted_index/parse.rb
121
+ - lib/inverted_index/stopwords.rb
122
+ - test/helper.rb
123
+ - test/test_inverted_index.rb
124
+ homepage: http://github.com/sfigart/inverted_index
125
+ licenses:
126
+ - MIT
127
+ post_install_message:
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ none: false
133
+ requirements:
134
+ - - ! '>='
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ segments:
138
+ - 0
139
+ hash: -747465224356148538
140
+ required_rubygems_version: !ruby/object:Gem::Requirement
141
+ none: false
142
+ requirements:
143
+ - - ! '>='
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ requirements: []
147
+ rubyforge_project:
148
+ rubygems_version: 1.8.17
149
+ signing_key:
150
+ specification_version: 3
151
+ summary: Inverted Index
152
+ test_files: []