RubyGems - moro-delta_attack - Versions diffs - 0.1.0 - Mend

moro-delta_attack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

data/ChangeLog +4 -0
data/NOTICE +5 -0
data/README +50 -0
data/Rakefile +139 -0
data/bin/delta_attack_server +66 -0
data/lib/delta_attack/client.rb +57 -0
data/lib/delta_attack/extractor/base.rb +23 -0
data/lib/delta_attack/extractor/excel.rb +41 -0
data/lib/delta_attack/extractor/power_point.rb +20 -0
data/lib/delta_attack/extractor/servlet.rb +31 -0
data/lib/delta_attack/extractor/word.rb +25 -0
data/lib/delta_attack/extractor.rb +20 -0
data/lib/delta_attack/filetype_assumption.rb +46 -0
data/lib/delta_attack.rb +5 -0
data/lib/vendor/README +8 -0
data/spec/extractor/excel_spec.rb +23 -0
data/spec/extractor/power_point_spec.rb +23 -0
data/spec/extractor/word_spec.rb +24 -0
data/spec/filetype_assumption_spec.rb +51 -0
data/spec/spec_helper.rb +13 -0
metadata +88 -0

data/ChangeLog ADDED Viewed

@@ -0,0 +1,4 @@
+== 0.0.1 / 2008-09-12
+* initial release

data/NOTICE ADDED Viewed

@@ -0,0 +1,5 @@
+This library depends on the Apache POI libraries,
+which are provided under the Apache License, Version 2.0.
+http://poi.apache.org/

data/README ADDED Viewed

@@ -0,0 +1,50 @@
+= delta_attack
+== Description
+Extract MS Office files to plain text.
+== Installation
+=== Archive Installation
+ $ rake install
+=== Gem Installation
+ $ gem source -a http://gems.github.com
+ $ gem install moro-delta_attack
+== Features/Problems
+Extract MS Office files to plain text using Apache POI and JRuby.
+It works with Client/Server architecture.
+The extraction server runs on JRuby, but the client works with
+both CRuby and JRuby.
+This library was originally written to index Office documents into a
+full-text search engine.
+== Synopsis
+First, start the DeltaAttack server, which needs JRuby and Apache POI:
+ $ export CLASSPATH=path/to/poi-3.1-FINAL/poi-3.1-FINAL-20080629.jar:\
+                    path/to/poi-3.1-FINAL/poi-scratchpad-3.1-FINAL-20080629.jar
+ $ jruby bin/delta_attack_server
+Then you can use DeltaAttack::Client, in both CRuby(MRI) and JRuby.
+ require 'delta_attack/client'
+ DeltaAttack::Client.cast("path/to/some.xls")
+== Copyright
+Author::    moro <moronatural@gmail.com>
+Copyright:: Copyright (c) 2008 moro
+License::   MIT

data/Rakefile ADDED Viewed

@@ -0,0 +1,139 @@
+require 'rubygems'
+require 'rake'
+require 'rake/clean'
+require 'rake/testtask'
+require 'rake/packagetask'
+require 'rake/gempackagetask'
+require 'rake/rdoctask'
+require 'rake/contrib/rubyforgepublisher'
+require 'rake/contrib/sshpublisher'
+require 'lib/delta_attack'
+require 'spec/rake/spectask'
+require 'fileutils'
+include FileUtils
+NAME              = "delta_attack"
+AUTHOR            = "MOROHASHI Kyosuke"
+EMAIL             = "moronatural@gmail.com"
+DESCRIPTION       = "extract text from MS Office document with Apache POI"
+# RUBYFORGE_PROJECT = "delta_attack"
+HOMEPATH          = "http://github.com/moro/delta_attack"
+BIN_FILES         = %w( delta_attack_server )
+VERS              = DeltaAttack::VERSION
+REV = File.read(".svn/entries")[/committed-rev="(d+)"/, 1] rescue nil
+CLEAN.include ['**/.*.sw?', '*.gem', '.config']
+RDOC_OPTS = [
+	'--title', "#{NAME} documentation",
+	"--charset", "utf-8",
+	"--opname", "index.html",
+	"--line-numbers",
+	"--main", "README",
+	"--inline-source",
+]
+# Running plain `rake` executes the :spec task.
+task :default => [:spec]
+# Packaging always runs :clean first.
+task :package => [:clean]
+# Defines the "spec" task: adds "spec" to the load path and runs every file
+# matching spec/**/*_spec.rb in verbose mode.
+Spec::Rake::SpecTask.new("spec") do |t|
+	t.libs   << "spec"
+	t.pattern = "spec/**/*_spec.rb"
+	t.verbose = true
+end
+spec = Gem::Specification.new do |s|
+	s.name              = NAME
+	s.version           = VERS
+	s.platform          = Gem::Platform::RUBY
+	s.has_rdoc          = true
+	s.extra_rdoc_files  = ["README", "ChangeLog"]
+	s.rdoc_options     += RDOC_OPTS + ['--exclude', '^(examples|extras)/']
+	s.summary           = DESCRIPTION
+	s.description       = DESCRIPTION
+	s.author            = AUTHOR
+	s.email             = EMAIL
+	s.homepage          = HOMEPATH
+	s.executables       = BIN_FILES
+#	s.rubyforge_project = RUBYFORGE_PROJECT
+	s.bindir            = "bin"
+	s.require_path      = "lib"
+	s.test_files        = Dir["spec/*_test.rb"]
+	#s.add_dependency('activesupport', '>=1.3.1')
+	#s.required_ruby_version = '>= 1.8.2'
+	s.files = %w(README NOTICE ChangeLog Rakefile) +
+		Dir.glob("{bin,doc,spec,lib,templates,generator,extras,website,script}/**/*") +
+		#Dir.glob("ext/**/*.{h,c,rb}") +
+		#Dir.glob("examples/**/*.rb") +
+		Dir.glob("tools/*.rb")
+	s.extensions = FileList["ext/**/extconf.rb"].to_a
+end
+# Packaging tasks for the gem specification above; need_tar additionally
+# requests a tar archive.
+Rake::GemPackageTask.new(spec) do |p|
+	p.need_tar = true
+	p.gem_spec = spec
+end
+# Prints the gem specification as Ruby source, for inspection.
+task :debug_gem do |p|
+  puts spec.to_ruby
+end
+# Builds the package via a nested `rake package`, then installs the resulting
+# pkg/NAME-VERS.gem with `sudo gem install`.
+task :install do
+	name = "#{NAME}-#{VERS}.gem"
+	sh %{rake package}
+	sh %{sudo gem install pkg/#{name}}
+end
+# Runs :clean, then uninstalls the gem with `sudo gem uninstall`.
+task :uninstall => [:clean] do
+	sh %{sudo gem uninstall #{NAME}}
+end
+# RDoc generation into html/ using the "resh" template. The documented files
+# come from the comma-separated DOC_FILES environment variable when it is set,
+# otherwise from README, ChangeLog, lib/**/*.rb and ext/**/*.c.
+Rake::RDocTask.new do |rdoc|
+	rdoc.rdoc_dir = 'html'
+	rdoc.options += RDOC_OPTS
+	rdoc.template = "resh"
+	#rdoc.template = "#{ENV['template']}.rb" if ENV['template']
+	if ENV['DOC_FILES']
+		rdoc.rdoc_files.include(ENV['DOC_FILES'].split(/,\s*/))
+	else
+		rdoc.rdoc_files.include('README', 'ChangeLog')
+		rdoc.rdoc_files.include('lib/**/*.rb')
+		rdoc.rdoc_files.include('ext/**/*.c')
+	end
+end
+=begin
+desc "Publish to RubyForge"
+task :rubyforge => [:rdoc, :package] do
+	require 'rubyforge'
+	Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'moro').upload
+end
+desc 'Package and upload the release to rubyforge.'
+task :release => [:clean, :package] do |t|
+	v = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
+	abort "Versions don't match #{v} vs #{VERS}" unless v == VERS
+	pkg = "pkg/#{NAME}-#{VERS}"
+	require 'rubyforge'
+	rf = RubyForge.new
+	puts "Logging in"
+	rf.login
+	c = rf.userconfig
+#	c["release_notes"] = description if description
+#	c["release_changes"] = changes if changes
+	c["preformatted"] = true
+	files = [
+		"#{pkg}.tgz",
+		"#{pkg}.gem"
+	].compact
+	puts "Releasing #{NAME} v. #{VERS}"
+	rf.add_release RUBYFORGE_PROJECT, NAME, VERS, *files
+end
+=end

data/bin/delta_attack_server ADDED Viewed

@@ -0,0 +1,66 @@
+#!/usr/bin/env jruby
+# vim:set fileencoding=utf-8 filetype=ruby
+$KCODE = 'u'
+require "optparse"
+require "rbconfig"
+require "delta_attack"
+module DeltaAttack
+  class Server
+    DEFAULT_OPTION = {
+      :port => 3333,
+      :mount => "/extract",
+    }.freeze
+    def self.run(argv)
+      if RbConfig::CONFIG["arch"] =~ /java/i
+        new(argv.dup).run
+      else
+        exec(*["jruby", $0, argv])
+      end
+    end
+    def initialize(argv)
+      @argv = argv
+      @options = DEFAULT_OPTION.dup
+      @parser = OptionParser.new do |parser|
+        parser.banner = <<-EOB.gsub(/^\t+/, "")
+          Usage: #$0 [options]
+        EOB
+        parser.separator "Options:"
+        parser.on("-p", "--port=PORT", Integer, "specify port default: #{DEFAULT_OPTION[:port]}") do |v|
+          @options[:port] = v
+        end
+        parser.on("-m", "--mount=PATH", String, "mount path of extract servlet #{DEFAULT_OPTION[:mount].dump}") do |v|
+          @options[:mount] = v
+        end
+        parser.separator ""
+        parser.on("--version", "Show version string `#{VERSION}'") do
+          puts VERSION
+          exit
+        end
+      end
+    end
+    def run
+		  @parser.order!(@argv)
+      require 'webrick/httpserver'
+      require 'delta_attack/extractor'
+      require 'delta_attack/extractor/servlet'
+      @server = WEBrick::HTTPServer.new(:Port=>@options[:port])
+      @server.mount(@options[:mount], DeltaAttack::Extractor::Servlet)
+      trap("INT"){ @server.shutdown }
+      @server.start
+    end
+  end
+end
+DeltaAttack::Server.run(ARGV)

data/lib/delta_attack/client.rb ADDED Viewed

@@ -0,0 +1,57 @@
+require 'net/http'
+require 'delta_attack/filetype_assumption'
+require 'securerandom'
+module DeltaAttack
+  # HTTP client that uploads one file to a DeltaAttack server as a
+  # multipart/form-data POST and returns the response body.
+  class Client
+    class << self
+      # Sends +file+ to the server and returns the response body.
+      #
+      # file:: path of the file to upload
+      # host:: server host name (default "localhost")
+      # port:: server port (default 3333)
+      # path:: path the servlet is mounted on (default "/extract"); pass the
+      #        same value the server was started with via --mount
+      #
+      # Raises RuntimeError when the connection is refused or the response
+      # is not 200 OK.
+      def cast(file, host="localhost", port=3333, path="/extract")
+        req = new(file).request(path)
+        res = Net::HTTP.start(host, port){|http| http.request(req) }
+        raise "Request failed #{res}" unless res.is_a? Net::HTTPOK
+        res.body
+      rescue Errno::ECONNREFUSED
+        raise "DeltaAttack Server is down on http://#{host}:#{port}"
+      end
+      alias extract cast
+    end
+    # filename:: path of the file to send
+    # content::  file content; read lazily from +filename+ when nil
+    def initialize(filename, content=nil)
+      @filename = filename
+      @content = content
+    end
+    # Multipart boundary, generated once per client instance.
+    def boundary
+      @boundary ||= SecureRandom.hex(8)
+    end
+    # File content, read in binary mode on first use unless it was supplied.
+    def content
+      @content ||= File.open(@filename,"rb"){|f| f.read }
+    end
+    # Content type of the uploaded part, derived from the file name. Falls
+    # back to application/octet-stream for unrecognised files so the part
+    # header is never sent with an empty value.
+    def content_type
+      @content_type ||= FiletypeAssumption.new(File.basename(@filename)).content_type || "application/octet-stream"
+    end
+    # Builds the multipart/form-data payload holding the single "file" part.
+    def body
+      data = ''
+      data << "--#{boundary}\r\n"
+      data << "Content-Disposition: form-data; name=\"file\"; filename=\"#{@filename}\"\r\n"
+      data << "Content-Type: #{content_type}\r\n\r\n"
+      data << content
+      data << "\r\n--#{boundary}--\r\n"
+    end
+    # Builds the Net::HTTP::Post request for +path+.
+    def request(path = "/extract" )
+      req = Net::HTTP::Post.new(path)
+      req.content_type = "multipart/form-data; boundary=#{boundary}"
+      req.body = body
+      # Content-Length counts bytes; String#size counts characters on
+      # encoding-aware Rubies and would under-report multibyte payloads.
+      req.content_length = req.body.bytesize
+      req
+    end
+  end
+end

data/lib/delta_attack/extractor/base.rb ADDED Viewed

@@ -0,0 +1,23 @@
+require 'java'
+module DeltaAttack
+  module Extractor
+    class Base
+      attr_accessor :bytes
+      def initialize(bytes)
+        @bytes = bytes
+      end
+      def data(ignore_cache=false)
+        return @data if (!ignore_cache) && @data
+        @data = extract_data
+      end
+      private
+      def java_input_stream
+        Java::JavaIo::ByteArrayInputStream.new(@bytes)
+      end
+    end
+  end
+end

data/lib/delta_attack/extractor/excel.rb ADDED Viewed

@@ -0,0 +1,41 @@
+require 'delta_attack/extractor/base'
+include_class 'org.apache.poi.hssf.usermodel.HSSFWorkbook'
+include_class 'org.apache.poi.hssf.usermodel.HSSFCell'
+module DeltaAttack
+  module Extractor
+    # Extractor for Excel workbooks, built on POI's HSSFWorkbook.
+    class Excel < Base
+      private
+      # Opens the bytes as an HSSFWorkbook and returns one entry per sheet,
+      # each produced by extract_sheet. The input stream is closed in the
+      # ensure clause whether or not parsing succeeds.
+      def extract_data
+        input_stream = java_input_stream
+        begin
+          book = HSSFWorkbook.new(input_stream)
+          return (0...book.number_of_sheets).map do |i|
+            extract_sheet(book.sheet_at(i))
+          end
+        ensure
+          input_stream.close
+        end
+      end
+      # Maps a sheet to a list of rows, each row being a list of the values
+      # returned by handle_cell.
+      def extract_sheet(sheet)
+        sheet.iterator.map do |row|
+          row.iterator.map{|cell| handle_cell(cell) }
+        end
+      end
+      # Converts one cell to a Ruby value: the numeric value for numeric
+      # cells, the string of the rich text for string cells, nil for boolean
+      # and blank cells. Cell types not listed here fall through the case
+      # and also yield nil.
+      def handle_cell(cell)
+        case cell.cell_type
+        when HSSFCell::CELL_TYPE_NUMERIC
+          cell.numeric_cell_value
+        when HSSFCell::CELL_TYPE_STRING
+          cell.rich_string_cell_value.string
+        when HSSFCell::CELL_TYPE_BOOLEAN, HSSFCell::CELL_TYPE_BLANK
+          nil
+        end
+      end
+    end
+  end
+end

data/lib/delta_attack/extractor/power_point.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require 'delta_attack/extractor/base'
+include_class 'org.apache.poi.hslf.usermodel.SlideShow'
+module DeltaAttack
+  module Extractor
+    class PowerPoint < Base
+      private
+      def extract_data
+        input_stream = java_input_stream
+        begin
+          slide_show = SlideShow.new(input_stream)
+          slide_show.slides.map do |slide|
+            slide.text_runs.map{|tr| tr.text }
+          end
+        end
+      end
+    end
+  end
+end

data/lib/delta_attack/extractor/servlet.rb ADDED Viewed

@@ -0,0 +1,31 @@
+require 'webrick/httpservlet'
+require 'delta_attack/extractor'
+require 'delta_attack/filetype_assumption'
+module DeltaAttack
+  module Extractor
+    class Servlet < WEBrick::HTTPServlet::AbstractServlet
+      def do_GET(req, res)
+        res.body = <<-HTML
+<html>
+  <head></head>
+  <body>
+    <form action="/extract" enctype="multipart/form-data" method="post">
+      <input type="file" name="file" />
+      <input type="submit" name="submit" value="up" />
+    </form>
+  </body>
+</html>
+        HTML
+      end
+      def do_POST(req, res)
+        f = req.query["file"]
+        type = FiletypeAssumption.new(f.filename, f['content-type'])
+        res.body = Extractor.extract(f.to_s, type.filetype)
+        res.content_type = "text/plain"
+      end
+    end
+  end
+end

data/lib/delta_attack/extractor/word.rb ADDED Viewed

@@ -0,0 +1,25 @@
+require 'delta_attack/extractor/base'
+include_class 'org.apache.poi.hwpf.HWPFDocument'
+module DeltaAttack
+  module Extractor
+    # Extractor for Word documents, built on POI's HWPFDocument.
+    class Word < Base
+      private
+      # Opens the bytes as an HWPFDocument and returns the stripped text of
+      # every paragraph in the document range, in order. The input stream is
+      # closed in the ensure clause whether or not parsing succeeds.
+      def extract_data
+        input_stream = java_input_stream
+        begin
+          book = HWPFDocument.new(input_stream)
+          range = book.range
+          (0...range.num_paragraphs).map do |i|
+            range.paragraph(i).text.strip
+          end
+        ensure
+          input_stream.close
+        end
+      end
+    end
+  end
+end

data/lib/delta_attack/extractor.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require 'delta_attack/extractor/base'
+require 'delta_attack/extractor/word'
+require 'delta_attack/extractor/excel'
+require 'delta_attack/extractor/power_point'
+module DeltaAttack
+  module Extractor
+    def extract(content,type)
+      extractor = case type
+                  when :word then Word
+                  when :excel then Excel
+                  when :power_point then PowerPoint
+                  else return "not supported"
+                  end
+      extractor.new(content.to_java_bytes).data.flatten.join("\n")
+    end
+    module_function :extract
+  end
+end

data/lib/delta_attack/filetype_assumption.rb ADDED Viewed

@@ -0,0 +1,46 @@
+begin
+  require 'mahoro'
+rescue LoadError
+  nil
+end
+module DeltaAttack
+  class FiletypeAssumption
+    CONTENT_TYPES = {
+      "application/msword" => :word,
+      "application/vnd.ms-excel" => :excel,
+      "application/vnd.ms-powerpoint" => :power_point,
+    }.freeze
+    def self.support_magic?
+      defined? Mahoro
+    end
+    def initialize(filename, content_type = nil, content = nil)
+      @filename = filename
+      @content_type = content_type
+      @content = content
+    end
+    def filetype
+      by_content_type || by_extention || :unknown
+    end
+    def content_type
+      CONTENT_TYPES.index(filetype)
+    end
+    private
+    def by_content_type
+      CONTENT_TYPES[@content_type]
+    end
+    def by_extention
+      case File.extname(@filename).downcase
+      when ".doc" then :word
+      when ".xls" then :excel
+      when ".ppt" then :power_point
+      end
+    end
+  end
+end

data/lib/delta_attack.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# Top-level namespace of the library.
+module DeltaAttack
+  # Library version string; read by the Rakefile and the server's --version option.
+  VERSION = "0.1.0"
+end

data/lib/vendor/README ADDED Viewed

@@ -0,0 +1,8 @@
+Download POI from <http://poi.apache.org/> version =< 3.1
+and symlink it here as poi-current.jar.
+For example:
+ $ ls -l lib/vendor/
+ lrwxr-xr-x  1 you  us   40 Sep 12 09:54 poi-current.jar -> poi-3.1-FINAL/poi-3.1-FINAL-20080629.jar

data/spec/extractor/excel_spec.rb ADDED Viewed

@@ -0,0 +1,23 @@
+require File.expand_path("../spec_helper", File.dirname(__FILE__))
+require 'delta_attack/extractor/excel'
+require 'java'
+require 'timeout'
+# Specs for the Excel extractor, run against the 13TOKYO.xls sample.
+describe DeltaAttack::Extractor::Excel do
+  include SpecHelper
+  before do
+    content = File.read(sample_data("13TOKYO.xls"))
+    @xls = DeltaAttack::Extractor::Excel.new(content.to_java_bytes)
+  end
+  it { @xls.bytes.should_not be_nil }
+  # The assertion indexes three levels deep (data[0][0][0]), although the
+  # example description only mentions two.
+  it "data[0][0].should == 13101" do
+    @xls.data[0][0][0].should == 13101
+  end
+  # Caching is checked indirectly: a second call to data must finish within
+  # the 0.1 second timeout.
+  it "2nd call of data() should be cached" do
+    @xls.data # 1st.
+    lambda{ timeout(0.1){ @xls.data } }.should_not raise_error(Timeout::Error)
+  end
+end

data/spec/extractor/power_point_spec.rb ADDED Viewed

@@ -0,0 +1,23 @@
+require File.expand_path("../spec_helper", File.dirname(__FILE__))
+require 'delta_attack/extractor/power_point'
+require 'java'
+require 'timeout'
+# Specs for the PowerPoint extractor, run against the named_scope06.ppt sample.
+describe DeltaAttack::Extractor::PowerPoint do
+  include SpecHelper
+  before do
+    content = File.read(sample_data("named_scope06.ppt"))
+    @ppt = DeltaAttack::Extractor::PowerPoint.new(content.to_java_bytes)
+  end
+  it { @ppt.bytes.should_not be_nil }
+  # The first extracted text is matched against /named_scope/.
+  it "data.flatten.first.should == /named_scope/" do
+    @ppt.data.flatten.first.should =~ /named_scope/
+  end
+  # Caching is checked indirectly: a second call to data must finish within
+  # the 0.1 second timeout.
+  it "2nd call of data() should be cached" do
+    @ppt.data # 1st.
+    lambda{ timeout(0.1){ @ppt.data } }.should_not raise_error(Timeout::Error)
+  end
+end

data/spec/extractor/word_spec.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require File.expand_path("../spec_helper", File.dirname(__FILE__))
+require 'delta_attack/extractor/word'
+require 'java'
+require 'timeout'
+$KCODE = "u"
+# Specs for the Word extractor, run against the myblog.doc sample.
+describe DeltaAttack::Extractor::Word do
+  include SpecHelper
+  before do
+    content = File.read(sample_data("myblog.doc"))
+    @doc = DeltaAttack::Extractor::Word.new(content.to_java_bytes)
+  end
+  it { @doc.bytes.should_not be_nil }
+  # The first extracted paragraph is matched against /WEBrick/.
+  it "data.flatten.first.should =~ /WEBrick/" do
+    @doc.data.flatten.first.should =~ /WEBrick/
+  end
+  # Caching is checked indirectly: a second call to data must finish within
+  # the 0.1 second timeout.
+  it "2nd call of data() should be cached" do
+    @doc.data # 1st.
+    lambda{ timeout(0.1){ @doc.data } }.should_not raise_error(Timeout::Error)
+  end
+end

data/spec/filetype_assumption_spec.rb ADDED Viewed

@@ -0,0 +1,51 @@
+require File.expand_path("spec_helper", File.dirname(__FILE__))
+require 'delta_attack/filetype_assumption'
+# Specs for file type detection by extension and by declared content type.
+describe DeltaAttack::FiletypeAssumption do
+  include SpecHelper
+  # NOTE(review): this example expects support_magic? to be falsy, so it
+  # presumably fails on a machine where the optional mahoro library can be
+  # loaded - confirm whether that is intended.
+  it "should not support_magic" do
+    DeltaAttack::FiletypeAssumption.should_not be_support_magic
+  end
+  # Detection from the file extension alone.
+  describe "new('hoge.xls')" do
+    before do
+      @asm = DeltaAttack::FiletypeAssumption.new('hoge.xls')
+    end
+    it "filetype.should == :excel" do
+      @asm.filetype.should == :excel
+    end
+  end
+  # A recognised content type decides the type even with an unknown extension.
+  describe "new('hoge.dat', 'application/vnd.ms-excel')" do
+    before do
+      @asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/vnd.ms-excel')
+    end
+    it "filetype.should == :excel" do
+      @asm.filetype.should == :excel
+    end
+  end
+  # Neither the content type nor the extension is recognised.
+  describe "new('hoge.dat', 'application/octet-stream')" do
+    before do
+      @asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/octet-stream')
+    end
+    it "filetype.should == :unknown" do
+      @asm.filetype.should == :unknown
+    end
+  end
+  # Content-based detection; marked pending unless support_magic? is truthy.
+  describe "new('hoge.dat', 'application/octet-stream', <content>)" do
+    before do
+      content = File.read(sample_data("13TOKYO.xls"))
+      @asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/octet-stream', content)
+    end
+    it "filetype.should == :excel" do
+      pending "mahoro is not installed" unless DeltaAttack::FiletypeAssumption.support_magic?
+      @asm.filetype.should == :excel
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,13 @@
+#!/usr/bin/env ruby
+# vim:set fileencoding=utf-8 filetype=ruby
+$KCODE = 'u'
+require 'rubygems'
+$:.unshift(File.expand_path("../lib", File.dirname(__FILE__)))
+# Helpers mixed into the spec groups.
+module SpecHelper
+  # Absolute path of the sample file +name+ under samples/data, resolved
+  # relative to the parent of the directory containing this file.
+  def sample_data(name)
+    File.expand_path("../samples/data/" + name, File.dirname(__FILE__))
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,88 @@
+--- !ruby/object:Gem::Specification
+name: moro-delta_attack
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- MOROHASHI Kyosuke
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2008-09-22 00:00:00 -07:00
+default_executable: delta_attack_server
+dependencies: []
+description: extract text from MS Office document with Apache POI
+email: moronatural@gmail.com
+executables:
+- delta_attack_server
+extensions: []
+extra_rdoc_files:
+- README
+- ChangeLog
+files:
+- README
+- NOTICE
+- ChangeLog
+- Rakefile
+- bin/delta_attack_server
+- spec/extractor
+- spec/extractor/excel_spec.rb
+- spec/extractor/power_point_spec.rb
+- spec/extractor/word_spec.rb
+- spec/filetype_assumption_spec.rb
+- spec/spec_helper.rb
+- lib/delta_attack
+- lib/delta_attack/client.rb
+- lib/delta_attack/extractor
+- lib/delta_attack/extractor/base.rb
+- lib/delta_attack/extractor/excel.rb
+- lib/delta_attack/extractor/power_point.rb
+- lib/delta_attack/extractor/servlet.rb
+- lib/delta_attack/extractor/word.rb
+- lib/delta_attack/extractor.rb
+- lib/delta_attack/filetype_assumption.rb
+- lib/delta_attack.rb
+- lib/vendor
+- lib/vendor/README
+has_rdoc: true
+homepage: http://github.com/moro/delta_attack
+post_install_message:
+rdoc_options:
+- --title
+- delta_attack documentation
+- --charset
+- utf-8
+- --opname
+- index.html
+- --line-numbers
+- --main
+- README
+- --inline-source
+- --exclude
+- ^(examples|extras)/
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.2.0
+signing_key:
+specification_version: 2
+summary: extract text from MS Office document with Apache POI
+test_files: []