autotag 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
File without changes
@@ -0,0 +1,8 @@
1
require 'rake/testtask'

# Register the :test task and put the test/ directory on its load path.
Rake::TestTask.new { |test_task| test_task.libs << 'test' }

# Running `rake` with no arguments runs the test suite.
desc "Run tests"
task default: :test
Binary file
@@ -0,0 +1,34 @@
1
# Gem specification for autotag.
#
# File lists are read from `git ls-files`, so the gem has to be built from
# inside a git checkout.
Gem::Specification.new do |s|
  s.name    = "autotag"
  s.version = "1.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors     = ["Matt Frisbie"]
  s.date        = %q{2013-01-13}
  s.email       = %q{msfrisbie@gmail.com}
  s.homepage    = %q{http://rubygems.org/gems/autotag}
  s.summary     = %q{autotag}
  s.description = %q{Autotag content gem}

  # Previously built packages that were committed to the repository
  # (e.g. autotag-0.0.1.gem) must not be shipped inside the new gem.
  s.files       = `git ls-files`.split("\n").reject { |f| File.extname(f) == ".gem" }
  s.test_files  = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }

  s.require_paths = ["lib"]

  s.specification_version = 3 if s.respond_to? :specification_version

  # Libraries required at load time by lib/autotag.rb; without these
  # declarations `require 'autotag'` fails on a fresh install.
  s.add_dependency "ai4r"
  s.add_dependency "redis"
  s.add_dependency "nokogiri"
  s.add_dependency "ruby-stemmer" # provides lingua/stemmer
end
34
+
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # still need to package this as a binary executable
4
+
5
+ require 'autotag'
6
+ #puts Hola.hi(ARGV[0])
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'ai4r'
3
+ require 'redis'
4
+ require 'nokogiri'
5
+ require 'lingua/stemmer'
6
+ require 'open-uri'
7
+
8
+ require 'autotag/extractor'
9
+ require 'autotag/extractor/document'
10
+ require 'autotag/extractor/document/histogram'
11
+ require 'autotag/extractor/document/stem'
12
+ require 'autotag/extractor/document/term'
13
+ require 'autotag/extractor/document/textblock'
14
+ #require
15
+ require 'autotag/tagger'
16
+ # require 'autotag/tagger/'
17
+
18
+
19
# Top-level namespace of the autotag gem. It defines nothing itself; the
# files required above add Autotag::Extractor and its classes.
module Autotag; end
@@ -0,0 +1,53 @@
1
+ # -*- encoding: utf-8 -*-
2
module Autotag
  # Namespace for the text-extraction classes.
  #
  # Nothing is defined in this module yet. The patterns below are design
  # notes for regex constants that have not been enabled:
  #
  #   CNTR_REGEX = /(n’t)|(n't)|(’ll)|('ll)|(’ve)|('ve)|(’re)|('re)|(’s)|('s)|(’d)|('d)/
  #     English contractions. A special-character wildcard would be shorter
  #     but would also match things like 'R&D'.
  #
  #   TW_REGEX = /^\s*$/
  #     Strings composed entirely of whitespace.
  #
  #   SDW_REGEX = /[\w-]+/
  #     Words of non-whitespace characters delimited by whitespace.
  #
  #   NL_REGEX = /[^A-Za-z]/
  #     Every character that is not an upper- or lower-case letter.
  #
  #   GW_REGEX = /\p{Z}/
  #     Whitespace characters, including the non-breaking spaces produced
  #     by an HTML -> latin1 conversion.
  #
  #   BDW_REGEX = /\p{Z}+/
  #     Continuous blocks of whitespace characters (block delimiter).
  module Extractor
  end
end
@@ -0,0 +1,69 @@
1
+ # -*- encoding: utf-8 -*-
2
module Autotag
  module Extractor
    # An HTML document broken into text blocks, with every word indexed
    # under its stem.
    #
    # Readers:
    #   url        - the location passed to the constructor
    #   textblocks - Array of Textblock, in document order
    #   stems      - Hash of stem string => Stem
    #   histogram  - never assigned by this class, so currently always nil
    class Document
      attr_reader :url, :stems, :textblocks, :histogram

      # Loads and parses the HTML at +url+, splits it into text blocks and
      # stems every word of every block with an English stemmer.
      def initialize(url)
        @stems = {}
        @url = url
        @textblocks = split_html(read_html(url))

        stemmer = Lingua::Stemmer.new(:language => "en")
        @textblocks.each_with_index do |block, block_index|
          block.words.each_with_index do |word, word_index|
            stem(word, [block_index, word_index], stemmer)
          end
        end
      end

      # Records one occurrence of +term+ under its stem.
      #
      # term        - the word as it appears in the text block
      # coordinates - [block index, word index] of the occurrence
      # stemmer     - object responding to #stem(term)
      def stem(term, coordinates, stemmer)
        stem = stemmer.stem(term)
        existing = @stems[stem]
        if existing
          existing.add_term(term, coordinates)
        else
          @stems[stem] = Autotag::Extractor::Stem.new(stem, term, coordinates)
        end
      end

      # Recursively flattens +node+ into an Array of Textblock, one per
      # non-blank text node.
      #
      # charsize and wordsize are running totals inherited from the
      # ancestors: each level adds the size of its own markup (children
      # detached, newlines stripped) and the word count of its direct text
      # children. Note that this detaches the children of every visited
      # node, so the parsed document is consumed by the walk.
      def split_html(node, charsize = 0, wordsize = 0)
        children = node.children.remove
        charsize += node.to_html.gsub("\n", '').size

        # The word total must be complete before any block is built, because
        # every block created at this level receives the same total.
        children.each do |child|
          wordsize += child.content.scan(/[\w-]+/).size if text_node?(child)
        end

        blocks = []
        children.each do |child|
          if text_node?(child)
            # \A and \z anchor the whole string; with ^ and $ a single blank
            # line inside real text made the entire node look empty.
            next if child.content =~ /\A\s*\z/
            blocks << Autotag::Extractor::Textblock.new(child.content, charsize, wordsize)
          else
            blocks.concat(split_html(child, charsize, wordsize))
          end
        end
        blocks
      end

      private

      # Opens +url+ and parses it as HTML, closing the handle afterwards.
      # URI.open is used when open-uri provides it, because Kernel#open no
      # longer accepts URLs on Ruby 3.0+; older Rubies fall back to the
      # open-uri patched Kernel#open.
      def read_html(url)
        opener = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
        opener.call(url) { |io| Nokogiri::HTML(io) }
      end

      # True only for plain text nodes (exact class match, subclasses excluded).
      def text_node?(node)
        node.instance_of?(Nokogiri::XML::Text)
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
module Autotag
  module Extractor
    # Occurrence counters for stems and for terms.
    #
    # Readers:
    #   stems - Hash of stem => number of times #stem was called with it
    #   terms - Hash of term => number of times #term was called with it
    class Histogram
      attr_reader :stems, :terms

      def initialize
        # Default of 0 lets the counters be incremented without a nil check.
        @stems = Hash.new(0)
        @terms = Hash.new(0)
      end

      # Counts one occurrence of +stem+ and returns its new total.
      def stem(stem)
        @stems[stem] += 1
      end

      # Counts one occurrence of +term+ and returns its new total.
      def term(term)
        @terms[term] += 1
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ # this class will contain an array of child words,
4
+ # each indexed into their location in the parent document
5
+
6
module Autotag
  module Extractor
    # A word stem together with the distinct terms that reduce to it.
    #
    # Readers:
    #   stem  - the stem itself
    #   terms - Array of Term, one per distinct spelling
    class Stem
      attr_reader :stem, :terms

      # stem        - the stem this object represents
      # term        - optional first term to record
      # coordinates - location of that term; a term given without any
      #               coordinates is ignored with a warning on stderr
      def initialize(stem, term = nil, coordinates = [])
        @stem = stem
        @terms = []
        # Constructing an empty stem is valid, so stay silent when no term
        # was supplied.
        return if term.nil?

        if coordinates.any?
          add_term(term, coordinates)
        else
          warn "Autotag::Extractor::Stem: ignoring term #{term.inspect} given without coordinates"
        end
      end

      # Number of distinct terms recorded under this stem.
      def size
        @terms.size
      end

      # Records an occurrence of +term+ at +coordinates+: a new Term is
      # created for an unseen spelling, otherwise the location is merged
      # into the existing Term.
      def add_term(term, coordinates)
        existing = find_term(term)
        if existing
          existing.merge(coordinates)
        else
          @terms << Autotag::Extractor::Term.new(term, coordinates)
        end
      end

      # Returns the recorded Term whose to_s equals +term+, or nil.
      def find_term(term)
        @terms.find { |candidate| term == candidate.to_s }
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # -*- encoding: utf-8 -*-
2
module Autotag
  module Extractor
    # One spelling of a word plus every location at which it was seen.
    #
    # Readers:
    #   term        - the word as given to the constructor
    #   coordinates - Array of locations, in the order they were recorded
    class Term
      attr_reader :term, :coordinates

      # Starts the location list with the first occurrence.
      def initialize(term, coordinates)
        @term = term
        @coordinates = Array.new(1, coordinates)
      end

      # The term itself.
      def to_s
        @term
      end

      # Appends another occurrence and returns the full location list.
      def merge(coordinates)
        @coordinates.push(coordinates)
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # -*- encoding: utf-8 -*-
2
module Autotag
  module Extractor
    # A run of text taken from one text node, split into words.
    #
    # charsize and wordsize are totals supplied by the caller and are only
    # used to compute #ratio.
    class Textblock
      # \p{Z} covers Unicode separators such as the non-breaking space but
      # not tab or newline, so \s is needed as well; otherwise words on
      # different lines are fused into one.
      WORD_SEPARATOR = /[\s\p{Z}]+/

      # Number of words in the block.
      attr_reader :size

      def initialize(str, charsize, wordsize)
        @charsize = charsize
        @wordsize = wordsize
        @words = str.split(WORD_SEPARATOR).reject(&:empty?)
        @size = @words.size
      end

      # wordsize divided by charsize, as a Float.
      def ratio
        @wordsize.to_f / @charsize.to_f
      end

      # The words with every character other than A-Z, a-z and 0-9 removed.
      # Positions line up with #[], so a word made only of punctuation
      # becomes an empty string rather than being dropped.
      def words
        @words.map { |word| word.gsub(/[^A-Za-z0-9]/, '') }
      end

      # The original words joined by single spaces.
      def plaintext
        @words.join(' ')
      end

      # The original (unstripped) word at +index+.
      def [](index)
        @words[index]
      end
    end
  end
end
File without changes
@@ -0,0 +1,14 @@
1
+ require 'test/unit'
2
+ require 'autotag'
3
+
4
# Unit tests for the extractor building blocks that need no network access.
# (Autotag::Extractor is a module, so it cannot be instantiated directly.)
class AutotagTest < Test::Unit::TestCase
  # A term keeps every location it is given, in order.
  def test_term_collects_coordinates
    term = Autotag::Extractor::Term.new("running", [0, 1])
    term.merge([2, 3])

    assert_equal "running", term.to_s
    assert_equal [[0, 1], [2, 3]], term.coordinates
  end

  # Repeated spellings are merged; new spellings get their own term.
  def test_stem_groups_repeated_terms
    stem = Autotag::Extractor::Stem.new("run", "running", [0, 1])
    stem.add_term("running", [0, 4])
    stem.add_term("runs", [1, 0])

    assert_equal 2, stem.size
    assert_equal [[0, 1], [0, 4]], stem.find_term("running").coordinates
    assert_nil stem.find_term("ran")
  end

  # Words are stripped of punctuation; plaintext keeps the original words.
  def test_textblock_words_strip_punctuation
    block = Autotag::Extractor::Textblock.new("Hello, world!", 20, 2)

    assert_equal ["Hello", "world"], block.words
    assert_equal 2, block.size
    assert_equal "Hello, world!", block.plaintext
  end
end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: autotag
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Matt Frisbie
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-13 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Autotag content gem
15
+ email: msfrisbie@gmail.com
16
+ executables:
17
+ - autotag
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - README
22
+ - Rakefile
23
+ - autotag-0.0.1.gem
24
+ - autotag.gemspec
25
+ - bin/autotag
26
+ - lib/autotag.rb
27
+ - lib/autotag/extractor.rb
28
+ - lib/autotag/extractor/document.rb
29
+ - lib/autotag/extractor/document/histogram.rb
30
+ - lib/autotag/extractor/document/stem.rb
31
+ - lib/autotag/extractor/document/term.rb
32
+ - lib/autotag/extractor/document/textblock.rb
33
+ - lib/autotag/tagger.rb
34
+ - test/test_autotag.rb
35
+ homepage: http://rubygems.org/gems/autotag
36
+ licenses: []
37
+ post_install_message:
38
+ rdoc_options: []
39
+ require_paths:
40
+ - lib
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ required_rubygems_version: !ruby/object:Gem::Requirement
48
+ none: false
49
+ requirements:
50
+ - - ! '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ requirements: []
54
+ rubyforge_project: autotag
55
+ rubygems_version: 1.8.24
56
+ signing_key:
57
+ specification_version: 3
58
+ summary: autotag
59
+ test_files: []