RubyGems - autotag - Versions diffs - 1.0.0 - Mend

autotag 1.0.0

Files changed (15) hide show

data/README +0 -0
data/Rakefile +8 -0
data/autotag-0.0.1.gem +0 -0
data/autotag.gemspec +34 -0
data/bin/autotag +6 -0
data/lib/autotag.rb +22 -0
data/lib/autotag/extractor.rb +53 -0
data/lib/autotag/extractor/document.rb +69 -0
data/lib/autotag/extractor/document/histogram.rb +26 -0
data/lib/autotag/extractor/document/stem.rb +38 -0
data/lib/autotag/extractor/document/term.rb +18 -0
data/lib/autotag/extractor/document/textblock.rb +37 -0
data/lib/autotag/tagger.rb +0 -0
data/test/test_autotag.rb +14 -0
metadata +59 -0

data/README ADDED

File without changes

data/Rakefile ADDED

@@ -0,0 +1,8 @@
+require 'rake/testtask'
+Rake::TestTask.new do |t|
+  t.libs << 'test'
+end
+desc "Run tests"
+task :default => :test

data/autotag-0.0.1.gem ADDED

Binary file

data/autotag.gemspec ADDED

@@ -0,0 +1,34 @@
+Gem::Specification.new do |s|
+  s.name               = "autotag"
+  s.version            = "1.0.0"
+  s.default_executable = "autotag"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Matt Frisbie"]
+  s.date = %q{2013-01-13}
+  s.email = %q{msfrisbie@gmail.com}
+  #s.files = ["Rakefile", "lib/autotag.rb", "lib/autotag/extractor.rb", "bin/autotag"]
+  #s.test_files = ["test/test_autotag.rb"]
+  s.homepage = %q{http://rubygems.org/gems/autotag}
+  s.rubygems_version = %q{1.6.2}
+  s.summary = %q{autotag}
+  s.description = %q{Autotag content gem}
+  s.rubyforge_project = "autotag"
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  if s.respond_to? :specification_version then
+    s.specification_version = 3
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+    else
+    end
+  else
+  end
+end

data/bin/autotag ADDED

@@ -0,0 +1,6 @@
+#!/usr/bin/env ruby
+# TODO: no command-line behavior is implemented yet; this script only loads the library.
+require 'autotag'
+#puts Hola.hi(ARGV[0])  # placeholder call; Hola is not defined in any file shown here

data/lib/autotag.rb ADDED

@@ -0,0 +1,22 @@
+# -*- encoding: utf-8 -*-
+require 'ai4r'
+require 'redis'
+require 'nokogiri'
+require 'lingua/stemmer'
+require 'open-uri'
+require 'autotag/extractor'
+require 'autotag/extractor/document'
+require 'autotag/extractor/document/histogram'
+require 'autotag/extractor/document/stem'
+require 'autotag/extractor/document/term'
+require 'autotag/extractor/document/textblock'
+#require
+require 'autotag/tagger'
+# require 'autotag/tagger/'
+module Autotag
+  # Top-level namespace for the gem; no module methods yet (stub idea: def self.test(param)).
+end

data/lib/autotag/extractor.rb ADDED

@@ -0,0 +1,53 @@
+# -*- encoding: utf-8 -*-
+module Autotag
+  # (disabled sketch) class << self
+  #   # pass a url or document to extract text
+  #   def Extractor(document,options)
+  #   end
+  # end
+  module Extractor
+  # Namespace only: everything below is commented out. (Earlier form: class Extractor)
+    # REGEX CONSTANTS (all currently disabled)
+    # CONTRACTION
+    # selects all instances of English contractions
+    # could perhaps be reduced using a special-character wildcard,
+    # but then strings such as 'R&D' must not be matched
+    # CNTR_REGEX = /(n’t)|(n't)|(’ll)|('ll)|(’ve)|('ve)|(’re)|('re)|(’s)|('s)|(’d)|('d)/
+    # TOTAL WHITESPACE
+    # meant to select all-whitespace strings; NOTE(review): ^ and $ anchor per line, \A and \z anchor the string
+    # TW_REGEX = /^\s*$/
+    # SPACE DELIMITED WORDS
+    # selects runs of word characters and hyphens
+    # SDW_REGEX = /[\w-]+/
+    # NON-LETTER REGEX
+    # selects all characters that are not uppercase/lowercase ASCII letters
+    # NL_REGEX = /[^A-Za-z]/
+    # GLOBAL WHITESPACE REGEX
+    # selects Unicode separator characters, including the non-breaking space from an HTML -> latin1 conversion
+    # NOTE(review): \p{Z} presumably does not cover tab or newline (control characters) - confirm
+    # GW_REGEX = /\p{Z}/
+    # BLOCK DELIMITING WHITESPACE REGEX
+    # selects continuous runs of separator characters
+    # BDW_REGEX = (/\p{Z}+/)
+    # def initialize(str = "")
+    #   @str = str
+    # end
+    # def test
+    #   p "fuck you: #{@str}"
+    #   p "test  . str".gsub(GW_REGEX,'DERP')
+    # end
+    #def split_html
+  end
+end

data/lib/autotag/extractor/document.rb ADDED

@@ -0,0 +1,69 @@
+# -*- encoding: utf-8 -*-
+class Autotag::Extractor::Document
+  attr_reader :url, :stems, :textblocks, :histogram
+  def initialize(url)
+    @stems = {}
+    @url = url
+    #@text =
+    @textblocks = split_html(Nokogiri::HTML(open(url)))
+    #@histogram = Autotag::Extractor::Histogram.new
+    s = Lingua::Stemmer.new(:language => "en")
+    @textblocks.each_with_index do |f,blockindex|
+      #f.stemwords.each_with_index do |g,wordindex|
+      f.words.each_with_index do |g,wordindex|
+        self.stem(g,[blockindex,wordindex],s)
+      end
+    end
+    #Nokogiri::Extractor::Histogram.new
+    #@terms =
+    #@textblocks = []
+  end
+  # def histogram
+  #   @histogram
+  # end
+  def stem(term,coordinates,stemmer)
+    stem = stemmer.stem(term)
+    s = @stems[stem]
+    if !s.nil?#present?
+      s.add_term(term,coordinates)
+    else
+      @stems[stem] = Autotag::Extractor::Stem.new(stem,term,coordinates)
+    end
+  end
+  def split_html(node,charsize=0,wordsize=0)
+    arr = []
+    subset = node.children.remove
+    charsize += node.to_html.gsub("\n",'').size
+    subset.each do |f|
+      if f.class==Nokogiri::XML::Text
+        # pull out line breaks and tabs from text
+        # wordsize+=f.content.gsub("\n","").gsub("\t","").scan(/[\w-]+/).size
+        wordsize+=f.content.scan(/[\w-]+/).size
+      end
+    end
+    subset.each do |f|
+      if f.class==Nokogiri::XML::Text
+        if (f.content=~/^\s*$/).nil?
+          # this can be accomplished more efficiently
+          #arr << [f.content.gsub("\n","").gsub("\t",""),charsize,wordsize,wordsize.to_f/charsize.to_f]
+          #arr << [f.content.gsub(/(n’t)|(n't)|(’ll)|('ll)|(’ve)|('ve)|(’re)|('re)|(’s)|('s)|(’d)|('d)/,'' ), charsize,wordsize,wordsize.to_f/charsize.to_f]
+          arr << Autotag::Extractor::Textblock.new(f.content,charsize,wordsize)
+        end
+      else
+        arr += split_html(f,charsize,wordsize)
+      end
+    end
+    arr
+  end
+end

data/lib/autotag/extractor/document/histogram.rb ADDED

@@ -0,0 +1,26 @@
+# -*- encoding: utf-8 -*-
+class Autotag::Extractor::Histogram
+  # generate term histogram,
+  # stem histogram, both accessible by methods
+  def initialize()
+    @stems = {}
+    @terms = {}
+  end
+  def stem(stem)
+    if @stems[stem]
+      @stems[stem] += 1
+    else
+      @stems[stem] = 1
+    end
+  end
+  def term(term)
+    if @terms[term]
+      @terms[term] += 1
+    else
+      @terms[term] = 1
+    end
+  end
+end

data/lib/autotag/extractor/document/stem.rb ADDED

@@ -0,0 +1,38 @@
+# -*- encoding: utf-8 -*-
+# this class will contain an array of child words,
+# each indexed into their location in the parent document
+class Autotag::Extractor::Stem
+  attr_reader :stem, :terms
+  def initialize(stem,term=nil,coordinates=[])
+    @stem = stem
+    @terms = []
+    if !term.nil? && coordinates.any?
+      self.add_term(term,coordinates)
+    else
+      p "error! #{term} #{coordinates}"
+    end
+  end
+  def size
+    @terms.size
+  end
+  def add_term(term,coordinates)
+    if (t=find_term(term)).nil?
+      @terms << Autotag::Extractor::Term.new(term,coordinates)
+    else
+      t.merge(coordinates)
+    end
+  end
+  def find_term(term)
+    @terms.each do |f|
+      return f if term == f.to_s
+    end
+    nil
+  end
+end

data/lib/autotag/extractor/document/term.rb ADDED

@@ -0,0 +1,18 @@
+# -*- encoding: utf-8 -*-
+class Autotag::Extractor::Term
+	attr_reader :term, :coordinates
+	# define location in parent documents
+	# define location in histogram?
+	def initialize(term,coordinates)
+		@term = term
+		@coordinates = [coordinates]
+	end
+	def to_s
+		return @term
+	end
+	def merge(coordinates)
+		@coordinates << coordinates
+	end
+end

data/lib/autotag/extractor/document/textblock.rb ADDED

@@ -0,0 +1,37 @@
+# -*- encoding: utf-8 -*-
+class Autotag::Extractor::Textblock
+  attr_reader :size, :words
+  # size, HTML data
+  def initialize(str,charsize,wordsize)
+    # count the number of blocks of non-whitespace characters
+    @charsize = charsize
+    @wordsize = wordsize
+    @words = str.split(/\p{Z}+/).reject{|f| f.empty?}
+    @size = @words.size
+  end
+  def ratio
+    return @wordsize.to_f/@charsize.to_f
+  end
+  # def stemwords
+  def words
+    # s = Lingua::Stemmer.new(:language => "en")
+    #@words.map{|f| [s.stem(f.gsub(/[^A-Za-z0-9]/,'')),f]}
+    @words.map{|f| f.gsub(/[^A-Za-z0-9]/,'')}
+  end
+  def plaintext
+    @words.join(' ')
+  end
+  def [] (index)
+    @words[index]
+  end
+  # returns number of words in the block
+  # def size
+  #   @size
+  # end
+end

data/lib/autotag/tagger.rb ADDED

File without changes

data/test/test_autotag.rb ADDED

@@ -0,0 +1,14 @@
+require 'test/unit'
+require 'autotag'
+class AutotagTest < Test::Unit::TestCase
+  # def test_empty
+  # 	h = Autotag::Extractor.new#("")
+  #   assert_equal "fuck you: empty", h.test
+  # end
+  def test_string
+  	h = Autotag::Extractor.new("blah")
+    assert_equal "fuck you: blah", h.test
+  end
+end

metadata ADDED

@@ -0,0 +1,59 @@
+--- !ruby/object:Gem::Specification
+name: autotag
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+  prerelease:
+platform: ruby
+authors:
+- Matt Frisbie
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-01-13 00:00:00.000000000 Z
+dependencies: []
+description: Autotag content gem
+email: msfrisbie@gmail.com
+executables:
+- autotag
+extensions: []
+extra_rdoc_files: []
+files:
+- README
+- Rakefile
+- autotag-0.0.1.gem
+- autotag.gemspec
+- bin/autotag
+- lib/autotag.rb
+- lib/autotag/extractor.rb
+- lib/autotag/extractor/document.rb
+- lib/autotag/extractor/document/histogram.rb
+- lib/autotag/extractor/document/stem.rb
+- lib/autotag/extractor/document/term.rb
+- lib/autotag/extractor/document/textblock.rb
+- lib/autotag/tagger.rb
+- test/test_autotag.rb
+homepage: http://rubygems.org/gems/autotag
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project: autotag
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: autotag
+test_files: []