autotag 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +0 -0
- data/Rakefile +8 -0
- data/autotag-0.0.1.gem +0 -0
- data/autotag.gemspec +34 -0
- data/bin/autotag +6 -0
- data/lib/autotag.rb +22 -0
- data/lib/autotag/extractor.rb +53 -0
- data/lib/autotag/extractor/document.rb +69 -0
- data/lib/autotag/extractor/document/histogram.rb +26 -0
- data/lib/autotag/extractor/document/stem.rb +38 -0
- data/lib/autotag/extractor/document/term.rb +18 -0
- data/lib/autotag/extractor/document/textblock.rb +37 -0
- data/lib/autotag/tagger.rb +0 -0
- data/test/test_autotag.rb +14 -0
- metadata +59 -0
data/README
ADDED
File without changes
|
data/Rakefile
ADDED
data/autotag-0.0.1.gem
ADDED
Binary file
|
data/autotag.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# Gem specification for autotag.
#
# The file list, test files and executables are taken from `git ls-files`,
# so the gem has to be built from inside a git checkout of the project.
Gem::Specification.new do |s|
  s.name    = "autotag"
  s.version = "1.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors          = ["Matt Frisbie"]
  s.date             = %q{2013-01-13}
  s.email            = %q{msfrisbie@gmail.com}
  s.homepage         = %q{http://rubygems.org/gems/autotag}
  s.rubygems_version = %q{1.6.2}
  s.summary          = %q{autotag}
  s.description      = %q{Autotag content gem}

  # Both setters are deprecated in RubyGems; guard them so the spec still
  # loads on versions where they are no longer defined.
  s.default_executable = "autotag" if s.respond_to? :default_executable=
  s.rubyforge_project  = "autotag" if s.respond_to? :rubyforge_project=

  # Previously built packages (e.g. autotag-0.0.1.gem) are tracked in git but
  # must not be shipped inside a new gem.
  s.files       = `git ls-files`.split("\n").reject { |f| File.extname(f) == ".gem" }
  s.test_files  = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }

  s.require_paths = ["lib"]

  s.specification_version = 3 if s.respond_to? :specification_version

  # Libraries that lib/autotag.rb requires at load time.
  # `lingua/stemmer` is provided by the ruby-stemmer gem.
  s.add_runtime_dependency "ai4r"
  s.add_runtime_dependency "redis"
  s.add_runtime_dependency "nokogiri"
  s.add_runtime_dependency "ruby-stemmer"
end
|
34
|
+
|
data/bin/autotag
ADDED
data/lib/autotag.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require 'ai4r'
|
3
|
+
require 'redis'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'lingua/stemmer'
|
6
|
+
require 'open-uri'
|
7
|
+
|
8
|
+
require 'autotag/extractor'
|
9
|
+
require 'autotag/extractor/document'
|
10
|
+
require 'autotag/extractor/document/histogram'
|
11
|
+
require 'autotag/extractor/document/stem'
|
12
|
+
require 'autotag/extractor/document/term'
|
13
|
+
require 'autotag/extractor/document/textblock'
|
14
|
+
#require
|
15
|
+
require 'autotag/tagger'
|
16
|
+
# require 'autotag/tagger/'
|
17
|
+
|
18
|
+
|
19
|
+
# Root namespace of the autotag gem.
#
# This file only loads the library's dependencies and components; the module
# itself defines no methods or constants here.
module Autotag
  # Disabled placeholder kept from the original source:
  #   def self.test(param)
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
module Autotag
  # Namespace for the text-extraction classes (Document, Textblock, Stem,
  # Term and Histogram are defined under it in their own files).
  #
  # The module currently has no methods or constants of its own. The regular
  # expressions below were drafted for it and are kept, disabled, for
  # reference; an earlier Extractor class (initialize(str) plus a debugging
  # `test` method) and a `split_html` stub were likewise commented out.
  module Extractor
    # CONTRACTION
    # selects all instances of English contractions; could perhaps be reduced
    # using a special-character wildcard, but then things like 'R&D' would
    # have to be taken into account
    # CNTR_REGEX = /(n’t)|(n't)|(’ll)|('ll)|(’ve)|('ve)|(’re)|('re)|(’s)|('s)|(’d)|('d)/

    # TOTAL WHITESPACE
    # selects strings that are composed entirely of whitespace
    # TW_REGEX = /^\s*$/

    # SPACE DELIMITED WORDS
    # selects words of non-whitespace characters delimited by whitespace
    # SDW_REGEX = /[\w-]+/

    # NON-LETTER REGEX
    # selects all characters that are not upper- or lower-case letters
    # NL_REGEX = /[^A-Za-z]/

    # GLOBAL WHITESPACE REGEX
    # selects all whitespace characters, including non-breaking spaces
    # imported from an HTML -> latin1 conversion
    # GW_REGEX = /\p{Z}/

    # BLOCK DELIMITING WHITESPACE REGEX
    # selects continuous blocks of whitespace characters
    # BDW_REGEX = (/\p{Z}+/)
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# Loads an HTML document, splits it into text blocks and indexes every word
# of every block under its English stem.
#
# Readers:
#   url        - the location the document was loaded from
#   textblocks - Array of Autotag::Extractor::Textblock, in document order
#   stems      - Hash mapping a stem String to its Autotag::Extractor::Stem
#   histogram  - never assigned by this class, so it is always nil
class Autotag::Extractor::Document
  attr_reader :url, :stems, :textblocks, :histogram

  # Loads +url+, extracts its text blocks and builds the stem index.
  #
  # url - an http/https/ftp URL, or the path of a local file.
  def initialize(url)
    @stems = {}
    @url = url
    @textblocks = split_html(load_html(url))

    stemmer = Lingua::Stemmer.new(:language => "en")

    @textblocks.each_with_index do |block, blockindex|
      block.words.each_with_index do |word, wordindex|
        # Tokens made only of punctuation sanitise to "" and carry no
        # meaning; skip them but keep wordindex aligned with the block.
        next if word.empty?
        stem(word, [blockindex, wordindex], stemmer)
      end
    end
  end

  # Records one occurrence of +term+ under its stem.
  #
  # term        - the word as it appears in the text block
  # coordinates - [block index, word index] of the occurrence
  # stemmer     - object responding to #stem(term)
  def stem(term, coordinates, stemmer)
    root = stemmer.stem(term)
    entry = @stems[root]
    if entry
      entry.add_term(term, coordinates)
    else
      @stems[root] = Autotag::Extractor::Stem.new(root, term, coordinates)
    end
  end

  # Recursively collects the non-blank text nodes below +node+ as Textblocks.
  #
  # NOTE: destructive - the children of every visited node are detached from
  # the parsed document, so each node's own markup can be measured without
  # its content.
  #
  # node     - a Nokogiri node (the whole document on the first call)
  # charsize - running total of markup characters of the ancestors
  # wordsize - running total of words in the text nodes seen on the way down
  #
  # Returns an Array of Autotag::Extractor::Textblock in document order.
  def split_html(node, charsize=0, wordsize=0)
    children = node.children.remove
    charsize += node.to_html.gsub("\n", '').size

    # First pass: every block created at this level (and below) sees the
    # word count of all sibling text nodes.
    children.each do |child|
      wordsize += child.content.scan(/[\w-]+/).size if child.text?
    end

    blocks = []
    children.each do |child|
      if child.text?
        # \A/\z anchor the whole string; with ^/$ any text containing an
        # empty line was mistaken for blank and dropped.
        next if child.content =~ /\A[\s\p{Z}]*\z/
        blocks << Autotag::Extractor::Textblock.new(child.content, charsize, wordsize)
      else
        blocks.concat(split_html(child, charsize, wordsize))
      end
    end
    blocks
  end

  private

  # Opens +location+ and parses it as HTML, closing the handle afterwards.
  #
  # Remote locations go through open-uri's URI#open; anything else is read as
  # a local file. Kernel#open is deliberately avoided: it executes strings
  # starting with "|" as commands and no longer opens URLs on Ruby 3.
  def load_html(location)
    uri = begin
      URI.parse(location.to_s)
    rescue URI::InvalidURIError
      nil
    end

    if uri.respond_to?(:open)
      uri.open { |io| Nokogiri::HTML(io) }
    else
      File.open(location.to_s) { |io| Nokogiri::HTML(io) }
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# Occurrence counters for stems and for terms.
#
# Readers:
#   stems - Hash mapping each counted stem to its number of occurrences
#   terms - Hash mapping each counted term to its number of occurrences
# Both hashes default to 0 for keys that were never counted.
class Autotag::Extractor::Histogram
  attr_reader :stems, :terms

  def initialize
    @stems = Hash.new(0)
    @terms = Hash.new(0)
  end

  # Counts one occurrence of +stem+.
  #
  # Returns the updated count for that stem.
  def stem(stem)
    @stems[stem] += 1
  end

  # Counts one occurrence of +term+.
  #
  # Returns the updated count for that term.
  def term(term)
    @terms[term] += 1
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# A word stem together with the distinct terms (surface forms) that reduce to
# it. Each term keeps the coordinates of its occurrences in the parent
# document.
#
# Readers:
#   stem  - the stem itself
#   terms - Array of Autotag::Extractor::Term
class Autotag::Extractor::Stem
  attr_reader :stem, :terms

  # stem        - the stem this object represents
  # term        - optional first term to register
  # coordinates - location of that first occurrence; must be given together
  #               with +term+
  #
  # With both optional arguments omitted the stem simply starts out empty.
  # If only one of them is usable, nothing is registered and a warning is
  # written to stderr.
  def initialize(stem, term=nil, coordinates=[])
    @stem = stem
    @terms = []
    if !term.nil? && coordinates.any?
      add_term(term, coordinates)
    elsif !(term.nil? && coordinates.empty?)
      warn "Autotag::Extractor::Stem: ignoring incomplete occurrence " \
           "(term=#{term.inspect}, coordinates=#{coordinates.inspect})"
    end
  end

  # Returns the number of distinct terms registered under this stem.
  def size
    @terms.size
  end

  # Registers an occurrence of +term+ at +coordinates+: a term seen before
  # gets the coordinates appended, a new term is added to the list.
  def add_term(term, coordinates)
    known = find_term(term)
    if known
      known.merge(coordinates)
    else
      @terms << Autotag::Extractor::Term.new(term, coordinates)
    end
  end

  # Returns the registered Term whose text equals +term+, or nil.
  def find_term(term)
    @terms.find { |candidate| term == candidate.to_s }
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# One surface form of a word plus every location at which it was seen.
#
# Readers:
#   term        - the word as given to the constructor
#   coordinates - Array holding one entry per recorded occurrence
class Autotag::Extractor::Term
  attr_reader :term, :coordinates

  # term        - the word
  # coordinates - location of its first occurrence; stored as the first
  #               entry of the coordinates list
  def initialize(term, coordinates)
    @coordinates = [coordinates]
    @term = term
  end

  # Returns the word exactly as it was passed in.
  def to_s
    term
  end

  # Appends another occurrence location.
  #
  # Returns the full list of coordinates.
  def merge(coordinates)
    @coordinates.push(coordinates)
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# A run of text taken from one text node, split into words.
#
# Readers:
#   size - number of words in the block
class Autotag::Extractor::Textblock
  attr_reader :size

  # str      - the raw text of the block
  # charsize - markup character count supplied by the caller
  # wordsize - word count supplied by the caller
  def initialize(str, charsize, wordsize)
    @charsize = charsize
    @wordsize = wordsize
    # \p{Z} covers Unicode separators such as the non-breaking space but not
    # tab or newline, so \s is needed too; otherwise words on adjacent lines
    # are glued together.
    @words = str.split(/[\p{Z}\s]+/).reject { |word| word.empty? }
    @size = @words.size
  end

  # Returns wordsize divided by charsize as a Float (NaN or Infinity when
  # charsize is 0, since the division is done in floating point).
  def ratio
    @wordsize.to_f / @charsize.to_f
  end

  # Returns the words with everything except ASCII letters and digits
  # removed. Positions match #[]; a word made only of other characters
  # becomes "".
  def words
    @words.map { |word| word.gsub(/[^A-Za-z0-9]/, '') }
  end

  # Returns the raw words joined by single spaces.
  def plaintext
    @words.join(' ')
  end

  # Returns the raw (unsanitised) word at +index+.
  def [](index)
    @words[index]
  end
end
|
File without changes
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'autotag'
|
3
|
+
|
4
|
+
# Unit tests for the extraction building blocks that need no network access.
#
# The previous test instantiated Autotag::Extractor, which is a module, and
# called a `test` method that only exists in commented-out code, so it could
# never pass.
class AutotagTest < Test::Unit::TestCase
  def test_term_collects_coordinates
    term = Autotag::Extractor::Term.new("running", [0, 1])
    term.merge([2, 3])

    assert_equal "running", term.to_s
    assert_equal [[0, 1], [2, 3]], term.coordinates
  end

  def test_stem_merges_repeated_terms
    stem = Autotag::Extractor::Stem.new("run", "running", [0, 1])
    stem.add_term("running", [0, 5])
    stem.add_term("runs", [1, 2])

    assert_equal 2, stem.size
    assert_equal [[0, 1], [0, 5]], stem.find_term("running").coordinates
    assert_nil stem.find_term("ran")
  end

  def test_textblock_words
    block = Autotag::Extractor::Textblock.new("Hello, world!", 20, 2)

    assert_equal 2, block.size
    assert_equal ["Hello", "world"], block.words
    assert_equal "Hello, world!", block.plaintext
    assert_equal "Hello,", block[0]
    assert_in_delta 0.1, block.ratio, 1e-9
  end

  def test_histogram_counts
    histogram = Autotag::Extractor::Histogram.new

    assert_equal 1, histogram.stem("run")
    assert_equal 2, histogram.stem("run")
    assert_equal 1, histogram.term("running")
  end
end
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: autotag
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Matt Frisbie
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-01-13 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Autotag content gem
|
15
|
+
email: msfrisbie@gmail.com
|
16
|
+
executables:
|
17
|
+
- autotag
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- README
|
22
|
+
- Rakefile
|
23
|
+
- autotag-0.0.1.gem
|
24
|
+
- autotag.gemspec
|
25
|
+
- bin/autotag
|
26
|
+
- lib/autotag.rb
|
27
|
+
- lib/autotag/extractor.rb
|
28
|
+
- lib/autotag/extractor/document.rb
|
29
|
+
- lib/autotag/extractor/document/histogram.rb
|
30
|
+
- lib/autotag/extractor/document/stem.rb
|
31
|
+
- lib/autotag/extractor/document/term.rb
|
32
|
+
- lib/autotag/extractor/document/textblock.rb
|
33
|
+
- lib/autotag/tagger.rb
|
34
|
+
- test/test_autotag.rb
|
35
|
+
homepage: http://rubygems.org/gems/autotag
|
36
|
+
licenses: []
|
37
|
+
post_install_message:
|
38
|
+
rdoc_options: []
|
39
|
+
require_paths:
|
40
|
+
- lib
|
41
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
48
|
+
none: false
|
49
|
+
requirements:
|
50
|
+
- - ! '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
requirements: []
|
54
|
+
rubyforge_project: autotag
|
55
|
+
rubygems_version: 1.8.24
|
56
|
+
signing_key:
|
57
|
+
specification_version: 3
|
58
|
+
summary: autotag
|
59
|
+
test_files: []
|