delta_attack 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
data/ChangeLog ADDED
@@ -0,0 +1,4 @@
1
+ == 0.0.1 / 2008-09-12
2
+
3
+ * initial release
4
+
data/NOTICE ADDED
@@ -0,0 +1,5 @@
1
+ This library depends on Apache POI libraries,
2
+ which are provided under the Apache License, Version 2.0.
3
+
4
+ http://poi.apache.org/
5
+
data/README ADDED
@@ -0,0 +1,50 @@
1
+
2
+ = delta_attack
3
+
4
+
5
+ == Description
6
+
7
+ Extract MS Office files to plain text.
8
+
9
+ == Installation
10
+
11
+
12
+ === Archive Installation
13
+
14
+ $ rake install
15
+
16
+ === Gem Installation
17
+
18
+ $ gem source -a http://gems.github.com
19
+ $ gem install moro-delta-attack
20
+
21
+ == Features/Problems
22
+
23
+ Extract MS Office files to plain text using Apache POI and JRuby.
24
+ It works with Client/Server architecture.
25
+
26
+ The extract server runs on JRuby, but the client works with
27
+ both CRuby and JRuby.
28
+
29
+ This library was originally intended for indexing Office documents in a
30
+ full-text search engine.
31
+
32
+ == Synopsis
33
+
34
+ First, start DeltaAttackServer, which needs JRuby and Apache POI:
35
+
36
+ $ export CLASSPATH=path/to/poi-3.1-FINAL/poi-3.1-FINAL-20080629.jar:\
37
+ path/to/poi-3.1-FINAL/poi-scratchpad-3.1-FINAL-20080629.jar
38
+ $ jruby bin/delta_attack_server
39
+
40
+ Then you can use DeltaAttack::Client, in both CRuby(MRI) and JRuby.
41
+
42
+ require 'delta_attack/client'
43
+ DeltaAttack::Client.cast("path/to/some.xls")
44
+
45
+ == Copyright
46
+
47
+ Author:: moro <moronatural@gmail.com>
48
+ Copyright:: Copyright (c) 2008 moro
49
+ License:: MIT
50
+
data/Rakefile ADDED
@@ -0,0 +1,139 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/clean'
4
+ require 'rake/testtask'
5
+ require 'rake/packagetask'
6
+ require 'rake/gempackagetask'
7
+ require 'rake/rdoctask'
8
+ require 'rake/contrib/rubyforgepublisher'
9
+ require 'rake/contrib/sshpublisher'
10
+ require 'lib/delta_attack'
11
+ require 'spec/rake/spectask'
12
+ require 'fileutils'
13
+ include FileUtils
14
+
15
# Project metadata shared by the gemspec and the tasks in this Rakefile.
NAME = "delta_attack"
AUTHOR = "MOROHASHI Kyosuke"
EMAIL = "moronatural@gmail.com"
DESCRIPTION = "extract text from MS Office document with Apache POI"
# RUBYFORGE_PROJECT = "delta_attack"
HOMEPATH = "http://github.com/moro/delta_attack"
BIN_FILES = %w( delta_attack_server )
VERS = DeltaAttack::VERSION

# Last committed Subversion revision, or nil when .svn/entries cannot be read
# or carries no committed-rev attribute. The capture group must be \d+ (one or
# more digits); a bare "d+" would only ever match literal "d" characters.
REV = begin
  File.read(".svn/entries")[/committed-rev="(\d+)"/, 1]
rescue StandardError
  nil
end

CLEAN.include ['**/.*.sw?', '*.gem', '.config']

# Options handed to RDoc both by the gemspec and by the rdoc task.
RDOC_OPTS = [
  '--title', "#{NAME} documentation",
  "--charset", "utf-8",
  "--opname", "index.html",
  "--line-numbers",
  "--main", "README",
  "--inline-source",
]

task :default => [:spec]
task :package => [:clean]

# Runs every spec below spec/ (including nested directories).
Spec::Rake::SpecTask.new("spec") do |t|
  t.libs << "spec"
  t.pattern = "spec/**/*_spec.rb"
  t.verbose = true
end
44
+
45
# Gem specification built from the constants defined earlier in this Rakefile.
spec = Gem::Specification.new do |s|
  s.name = NAME
  s.version = VERS
  s.platform = Gem::Platform::RUBY
  s.has_rdoc = true
  s.extra_rdoc_files = ["README", "ChangeLog"]
  s.rdoc_options += RDOC_OPTS + ['--exclude', '^(examples|extras)/']
  s.summary = DESCRIPTION
  s.description = DESCRIPTION
  s.author = AUTHOR
  s.email = EMAIL
  s.homepage = HOMEPATH
  s.executables = BIN_FILES
  # s.rubyforge_project = RUBYFORGE_PROJECT
  s.bindir = "bin"
  s.require_path = "lib"
  # The project's tests are RSpec files named *_spec.rb, some of them in
  # sub-directories of spec/; the previous "spec/*_test.rb" glob matched nothing.
  s.test_files = Dir["spec/**/*_spec.rb"]

  #s.add_dependency('activesupport', '>=1.3.1')
  #s.required_ruby_version = '>= 1.8.2'

  # Everything under the usual project directories, minus the vendored files
  # under lib/vendor, plus the README that explains how to obtain them.
  # (Array + and - share precedence and evaluate left to right.)
  packaged = %w(README NOTICE ChangeLog Rakefile) +
    Dir.glob("{bin,doc,spec,lib,templates,generator,extras,website,script}/**/*") +
    Dir.glob("tools/*.rb")
  s.files = packaged - Dir.glob("lib/vendor/**/*") + Dir.glob("lib/vendor/README")

  s.extensions = FileList["ext/**/extconf.rb"].to_a
end

# Defines the package tasks for the specification above (gem plus tarball).
Rake::GemPackageTask.new(spec) do |p|
  p.need_tar = true
  p.gem_spec = spec
end

# Prints the specification as Ruby source, for inspection.
task :debug_gem do |p|
  puts spec.to_ruby
end
83
+
84
# Builds the gem in a child rake process, then installs the built file.
task :install do
  gem_file = "#{NAME}-#{VERS}.gem"
  sh "rake package"
  sh "sudo gem install pkg/#{gem_file}"
end

# Removes the installed gem after cleaning the working tree.
task :uninstall => [:clean] do
  sh "sudo gem uninstall #{NAME}"
end

# Generates HTML documentation into html/. The documented files can be
# overridden with a comma-separated DOC_FILES environment variable.
Rake::RDocTask.new do |rdoc|
  rdoc.rdoc_dir = 'html'
  rdoc.options += RDOC_OPTS
  rdoc.template = "resh"
  #rdoc.template = "#{ENV['template']}.rb" if ENV['template']
  requested = ENV['DOC_FILES']
  patterns =
    if requested
      requested.split(/,\s*/)
    else
      ['README', 'ChangeLog', 'lib/**/*.rb', 'ext/**/*.c']
    end
  rdoc.rdoc_files.include(patterns)
end
108
+ =begin
109
+ desc "Publish to RubyForge"
110
+ task :rubyforge => [:rdoc, :package] do
111
+ require 'rubyforge'
112
+ Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'moro').upload
113
+ end
114
+
115
+ desc 'Package and upload the release to rubyforge.'
116
+ task :release => [:clean, :package] do |t|
117
+ v = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
118
+ abort "Versions don't match #{v} vs #{VERS}" unless v == VERS
119
+ pkg = "pkg/#{NAME}-#{VERS}"
120
+
121
+ require 'rubyforge'
122
+ rf = RubyForge.new
123
+ puts "Logging in"
124
+ rf.login
125
+
126
+ c = rf.userconfig
127
+ # c["release_notes"] = description if description
128
+ # c["release_changes"] = changes if changes
129
+ c["preformatted"] = true
130
+
131
+ files = [
132
+ "#{pkg}.tgz",
133
+ "#{pkg}.gem"
134
+ ].compact
135
+
136
+ puts "Releasing #{NAME} v. #{VERS}"
137
+ rf.add_release RUBYFORGE_PROJECT, NAME, VERS, *files
138
+ end
139
+ =end
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env jruby
2
+ # vim:set fileencoding=utf-8 filetype=ruby
3
+ $KCODE = 'u'
4
+
5
+ require "optparse"
6
+ require "rbconfig"
7
+ require "delta_attack"
8
+
9
module DeltaAttack
  # Command-line launcher for the extraction HTTP server.
  #
  # Parses the command-line options and serves
  # DeltaAttack::Extractor::Servlet through WEBrick.
  class Server

    # Values used when the corresponding option is not given.
    DEFAULT_OPTION = {
      :port => 3333,
      :mount => "/extract",
    }.freeze

    # Runs the server in this process when the interpreter reports a Java
    # architecture; otherwise replaces the process with `jruby` running the
    # same script and arguments.
    def self.run(argv)
      on_java = RbConfig::CONFIG["arch"] =~ /java/i
      return new(argv.dup).run if on_java

      exec("jruby", $0, *argv)
    end

    # argv:: the command-line arguments; consumed by #run.
    def initialize(argv)
      @argv = argv
      @options = DEFAULT_OPTION.dup
      @parser = build_parser
    end

    # Parses the options, loads the server code, then serves until
    # interrupted (SIGINT shuts the server down).
    def run
      @parser.order!(@argv)
      %w(
        webrick/httpserver
        delta_attack/extractor
        delta_attack/extractor/servlet
      ).each { |feature| require feature }

      @server = WEBrick::HTTPServer.new(:Port => @options[:port])
      @server.mount(@options[:mount], DeltaAttack::Extractor::Servlet)
      trap("INT") { @server.shutdown }
      @server.start
    end

    private

    # Builds the OptionParser whose handlers store their values in @options.
    def build_parser
      OptionParser.new do |opt|
        opt.banner = "Usage: #$0 [options]\n"

        opt.separator "Options:"
        opt.on("-p", "--port=PORT", Integer, "specify port default: #{DEFAULT_OPTION[:port]}") do |port|
          @options[:port] = port
        end
        opt.on("-m", "--mount=PATH", String, "mount path of extract servlet #{DEFAULT_OPTION[:mount].dump}") do |path|
          @options[:mount] = path
        end

        opt.separator ""

        opt.on("--version", "Show version string `#{VERSION}'") do
          puts VERSION
          exit
        end
      end
    end
  end
end
64
+
65
+ DeltaAttack::Server.run(ARGV)
66
+
@@ -0,0 +1,5 @@
1
+
2
module DeltaAttack
  # Library version. The Rakefile builds the gem version from this constant
  # and the server prints it for --version, so it must match the released
  # package version.
  VERSION = "0.1.4"
end
5
+
@@ -0,0 +1,65 @@
1
+
2
+ require 'net/http'
3
+ require 'delta_attack/filetype_assumption'
4
+ require 'securerandom'
5
+
6
module DeltaAttack
  # HTTP client for the extraction server: uploads one document as
  # multipart/form-data and returns the extracted plain text.
  class Client
    # Sent when the document type cannot be guessed from the file name.
    FALLBACK_CONTENT_TYPE = "application/octet-stream"

    class << self
      # Extracts the text of the file at +filename+.
      #
      # filename::     path of the document to upload.
      # content_type:: MIME type to send; guessed from the file name when nil.
      # host, port::   location of the extraction server.
      #
      # Returns the response body. Raises RuntimeError when the server cannot
      # be reached or does not answer 200 OK.
      def cast(filename, content_type = nil, host="localhost", port=3333)
        cast_buf(nil, filename, content_type, host, port)
      end
      alias extract cast

      # Same as ::cast, but uploads +content+ instead of reading the file.
      # When +content+ is nil the file named +filename+ is read.
      def cast_buf(content, filename = "no-filename", content_type = nil, host="localhost", port=3333)
        client = new(filename, content)
        client.content_type = content_type
        res = Net::HTTP.start(host, port) { |http| http.request(client.request) }
        raise "Request failed #{res}" unless res.is_a? Net::HTTPOK
        res.body
      rescue Errno::ECONNREFUSED
        raise "DeltaAttack Server is down on http://#{host}:#{port}"
      end
      alias extract_buf cast_buf
    end

    attr_writer :content_type

    def initialize(filename, content=nil)
      @filename = filename
      @content = content
    end

    # Multipart boundary for this client, generated once and then reused.
    # A random value is used so that it does not depend on Digest (which this
    # file never requires) and is not a fixed, predictable string.
    def boundary
      @boundary ||= SecureRandom.hex(16)
    end

    # Document bytes; read lazily from the file when none were given.
    def content
      @content ||= File.open(@filename,"rb"){|f| f.read }
    end

    # MIME type of the upload: the explicitly assigned one, else a guess from
    # the file name, else FALLBACK_CONTENT_TYPE.
    def content_type
      @content_type ||=
        FiletypeAssumption.new(File.basename(@filename)).content_type ||
        FALLBACK_CONTENT_TYPE
    end

    # Complete multipart/form-data payload containing the single "file" part.
    def body
      [
        "--#{boundary}\r\n",
        "Content-Disposition: form-data; name=\"file\"; filename=\"#{@filename}\"\r\n",
        "Content-Type: #{content_type}\r\n\r\n",
        content,
        "\r\n--#{boundary}--\r\n",
      ].map { |part| binary(part) }.join
    end

    # Builds the POST request for +path+ with its multipart headers set.
    def request(path = "/extract" )
      payload = body
      req = Net::HTTP::Post.new(path)
      req.content_type = "multipart/form-data; boundary=#{boundary}"
      req.body = payload
      # Content-Length counts bytes, not characters.
      req.content_length = payload.bytesize
      req
    end

    private

    # Returns a binary-encoded copy of +str+ so that header text (possibly a
    # non-ASCII file name) and raw document bytes can be joined without an
    # encoding conflict. Strings without encodings are returned unchanged.
    def binary(str)
      return str unless str.respond_to?(:force_encoding)
      str.dup.force_encoding("BINARY")
    end
  end
end
65
+
@@ -0,0 +1,21 @@
1
+ require 'delta_attack/extractor/base'
2
+ require 'delta_attack/extractor/word'
3
+ require 'delta_attack/extractor/excel'
4
+ require 'delta_attack/extractor/power_point'
5
+
6
module DeltaAttack
  # Entry point for text extraction; picks the extractor class by file type.
  module Extractor
    # Raised when the requested file type has no extractor.
    class Error < RuntimeError
    end

    # Extracts the text of +content+ as one newline-joined String.
    #
    # content:: object responding to +to_java_bytes+.
    # type::    :word, :excel or :power_point.
    #
    # Raises Extractor::Error for any other type.
    def extract(content,type)
      klass = case type
              when :word        then Word
              when :excel       then Excel
              when :power_point then PowerPoint
              end
      raise Error.new("not supported") if klass.nil?

      pieces = klass.new(content.to_java_bytes).data
      pieces.flatten.join("\n")
    end
    module_function :extract
  end
end
@@ -0,0 +1,23 @@
1
+ require 'java'
2
+
3
module DeltaAttack
  module Extractor
    # Common behaviour of the extractors: keeps the document bytes and caches
    # the result of the subclass-provided +extract_data+.
    class Base
      attr_accessor :bytes

      # bytes:: the document bytes handed to the Java input stream.
      def initialize(bytes)
        @bytes = bytes
      end

      # Returns the extracted data, calling +extract_data+ only when nothing
      # is cached yet or +ignore_cache+ is true.
      def data(ignore_cache=false)
        @data = extract_data if ignore_cache || !@data
        @data
      end

      private

      # Wraps the stored bytes in a java.io.ByteArrayInputStream.
      def java_input_stream
        Java::JavaIo::ByteArrayInputStream.new(@bytes)
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ require 'delta_attack/extractor/base'
2
+
3
+ include_class 'org.apache.poi.hssf.usermodel.HSSFWorkbook'
4
+ include_class 'org.apache.poi.hssf.usermodel.HSSFCell'
5
+
6
module DeltaAttack
  module Extractor
    # Extractor for Excel workbooks, read through POI's HSSFWorkbook.
    class Excel < Base
      private

      # Returns one entry per sheet, each built by #extract_sheet.
      # The input stream is closed whether or not reading succeeds.
      def extract_data
        stream = java_input_stream
        begin
          workbook = HSSFWorkbook.new(stream)
          sheet_indexes = 0...workbook.number_of_sheets
          return sheet_indexes.map { |idx| extract_sheet(workbook.sheet_at(idx)) }
        ensure
          stream.close
        end
      end

      # Maps a sheet to rows, and each row to its converted cell values.
      def extract_sheet(sheet)
        sheet.iterator.map do |cells|
          cells.iterator.map { |c| handle_cell(c) }
        end
      end

      # Converts one cell: numeric cells give their numeric value, string
      # cells their text; boolean, blank and all other cell types give nil.
      def handle_cell(cell)
        case cell.cell_type
        when HSSFCell::CELL_TYPE_NUMERIC
          cell.numeric_cell_value
        when HSSFCell::CELL_TYPE_STRING
          cell.rich_string_cell_value.string
        when HSSFCell::CELL_TYPE_BOOLEAN, HSSFCell::CELL_TYPE_BLANK
          nil
        else
          nil
        end
      end
    end
  end
end
41
+
@@ -0,0 +1,20 @@
1
+ require 'delta_attack/extractor/base'
2
+
3
+ include_class 'org.apache.poi.hslf.usermodel.SlideShow'
4
+
5
module DeltaAttack
  module Extractor
    # Extractor for PowerPoint presentations, read through POI's SlideShow.
    class PowerPoint < Base
      private

      # Returns one Array per slide containing the text of each of the
      # slide's text runs.
      #
      # The input stream is closed in an ensure clause, as the Excel and Word
      # extractors do, so it is released even when POI fails to parse.
      def extract_data
        input_stream = java_input_stream
        begin
          slide_show = SlideShow.new(input_stream)
          slide_show.slides.map do |slide|
            slide.text_runs.map{|tr| tr.text }
          end
        ensure
          input_stream.close
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'webrick/httpservlet'
2
+ require 'delta_attack/extractor'
3
+ require 'delta_attack/filetype_assumption'
4
+
5
module DeltaAttack
  module Extractor
    # WEBrick servlet exposing the extractor over HTTP: GET serves an upload
    # form, POST extracts the uploaded document and answers with plain text.
    class Servlet < WEBrick::HTTPServlet::AbstractServlet
      # Serves a minimal HTML form for uploading a document by hand.
      def do_GET(req, res)
        res.body = <<-HTML
          <html>
            <head></head>
            <body>
              <form action="/extract" enctype="multipart/form-data" method="post">
                <input type="file" name="file" />
                <input type="submit" name="submit" value="up" />
              </form>
            </body>
          </html>
        HTML
      end

      # Extracts the text of the uploaded "file" form field.
      #
      # Raises WEBrick::HTTPStatus::BadRequest when the field is missing or
      # its type is not supported, and InternalServerError when extraction
      # fails for any other reason.
      def do_POST(req, res)
        f = req.query["file"]
        # Without this guard a request lacking the field would fail with a
        # NoMethodError on nil and be reported as a server error.
        raise WEBrick::HTTPStatus::BadRequest, "file is required" unless f

        type = FiletypeAssumption.new(f.filename, f['content-type'])
        begin
          res.body = Extractor.extract(f.to_s, type.filetype)
          res.content_type = "text/plain"
        rescue Extractor::Error => exe
          raise WEBrick::HTTPStatus::BadRequest, exe.message
        rescue StandardError => stdex
          raise WEBrick::HTTPStatus::InternalServerError, stdex.message
        end
      end
    end
  end
end
37
+
@@ -0,0 +1,25 @@
1
+ require 'delta_attack/extractor/base'
2
+
3
+ include_class 'org.apache.poi.hwpf.HWPFDocument'
4
+
5
module DeltaAttack
  module Extractor
    # Extractor for Word documents, read through POI's HWPFDocument.
    class Word < Base

      private

      # Returns the stripped text of every paragraph in the document range.
      # The input stream is closed whether or not reading succeeds.
      def extract_data
        stream = java_input_stream
        begin
          whole = HWPFDocument.new(stream).range
          paragraph_indexes = 0...whole.num_paragraphs
          paragraph_indexes.map { |idx| whole.paragraph(idx).text.strip }
        ensure
          stream.close
        end
      end
    end
  end
end
25
+
@@ -0,0 +1,46 @@
1
+ begin
2
+ require 'mahoro'
3
+ rescue LoadError
4
+ nil
5
+ end
6
+
7
module DeltaAttack
  # Guesses the kind of Office document from a declared content type and,
  # failing that, from the file name extension.
  class FiletypeAssumption
    # Supported MIME types and the file type symbol each one maps to.
    CONTENT_TYPES = {
      "application/msword" => :word,
      "application/vnd.ms-excel" => :excel,
      "application/vnd.ms-powerpoint" => :power_point,
    }.freeze

    # True when the Mahoro constant is defined, false otherwise.
    def self.support_magic?
      defined?(Mahoro) ? true : false
    end

    # filename::     name used for the extension-based guess.
    # content_type:: declared MIME type, if any.
    # content::      document bytes; stored but not consulted by the
    #                detection implemented here.
    def initialize(filename, content_type = nil, content = nil)
      @filename = filename
      @content_type = content_type
      @content = content
    end

    # Returns :word, :excel, :power_point, or :unknown when neither the
    # content type nor the extension is recognised.
    def filetype
      by_content_type || by_extention || :unknown
    end

    # Returns the MIME type for the detected file type, or nil when unknown.
    def content_type
      # Reverse lookup through #invert rather than the deprecated Hash#index.
      CONTENT_TYPES.invert[filetype]
    end

    private

    # Looks the declared type up after dropping any parameters
    # ("; charset=...") and normalising case and surrounding whitespace.
    def by_content_type
      media_type = @content_type.to_s.split(";").first.to_s.strip.downcase
      CONTENT_TYPES[media_type]
    end

    # Guesses from the file extension, case-insensitively; nil when the
    # extension is not one of .doc, .xls or .ppt.
    def by_extention
      case File.extname(@filename.to_s).downcase
      when ".doc" then :word
      when ".xls" then :excel
      when ".ppt" then :power_point
      end
    end
  end
end
data/lib/vendor/README ADDED
@@ -0,0 +1,8 @@
1
+ Download POI (version <= 3.1) from <http://poi.apache.org/>
2
+
3
+ and symlink it here as poi-current.jar
4
+
5
+ example,
6
+
7
+ $ ls -l lib/vendor/
8
+ lrwxr-xr-x 1 you us 40 Sep 12 09:54 poi-current.jar -> poi-3.1-FINAL/poi-3.1-FINAL-20080629.jar
@@ -0,0 +1,23 @@
1
+ require File.expand_path("../spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/extractor/excel'
3
+ require 'java'
4
+ require 'timeout'
5
+
6
describe DeltaAttack::Extractor::Excel do
  include SpecHelper
  before do
    # Read the sample in binary mode so its bytes are never translated.
    content = File.open(sample_data("13TOKYO.xls"), "rb") { |f| f.read }
    @xls = DeltaAttack::Extractor::Excel.new(content.to_java_bytes)
  end

  it { @xls.bytes.should_not be_nil }

  # data is nested as sheets -> rows -> cells, hence three indexes.
  it "data[0][0][0].should == 13101" do
    @xls.data[0][0][0].should == 13101
  end

  # A cached result is the very same object on the second call; checking
  # identity avoids depending on how fast the machine is.
  it "2nd call of data() should be cached" do
    first = @xls.data
    @xls.data.should equal(first)
  end
end
23
+
@@ -0,0 +1,23 @@
1
+ require File.expand_path("../spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/extractor/power_point'
3
+ require 'java'
4
+ require 'timeout'
5
+
6
describe DeltaAttack::Extractor::PowerPoint do
  include SpecHelper
  before do
    # Read the sample in binary mode so its bytes are never translated.
    content = File.open(sample_data("named_scope06.ppt"), "rb") { |f| f.read }
    @ppt = DeltaAttack::Extractor::PowerPoint.new(content.to_java_bytes)
  end

  it { @ppt.bytes.should_not be_nil }

  it "data.flatten.first.should =~ /named_scope/" do
    @ppt.data.flatten.first.should =~ /named_scope/
  end

  # A cached result is the very same object on the second call; checking
  # identity avoids depending on how fast the machine is.
  it "2nd call of data() should be cached" do
    first = @ppt.data
    @ppt.data.should equal(first)
  end
end
23
+
@@ -0,0 +1,54 @@
1
+ require File.expand_path("../spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/extractor/servlet'
3
+
4
describe DeltaAttack::Extractor::Servlet do
  before do
    @servlet = DeltaAttack::Extractor::Servlet.new("hoge", {})

    # Stand-in for the uploaded form field the servlet reads from the query.
    file = mock("upload_file")
    file.should_receive(:filename).and_return("foo.xls")
    file.should_receive(:[]).with("content-type").and_return("application/vnd.ms-excel")
    file.should_receive(:to_s).and_return("DATA-DATA")

    @req = mock("request")
    @req.should_receive(:query).and_return("file"=>file)

    @res = Struct.new(:body, :content_type, :status).new
  end

  describe "pass" do
    before do
      DeltaAttack::Extractor.should_receive(:extract).with("DATA-DATA", :excel).and_return("RESPONSE")
      @servlet.do_POST(@req, @res)
    end

    it "@res.body.should == 'RESPONSE'" do
      @res.body.should == 'RESPONSE'
    end

    # Checks the content type itself rather than repeating the body check.
    it "@res.content_type.should == 'text/plain'" do
      @res.content_type.should == 'text/plain'
    end
  end

  describe "fail with unsupported type" do
    before do
      DeltaAttack::Extractor.should_receive(:extract).and_raise(DeltaAttack::Extractor::Error)
    end

    it "do_POST.should raise_error(WEBrick::HTTPStatus::BadRequest)" do
      lambda{ @servlet.do_POST(@req, @res) }.should raise_error(WEBrick::HTTPStatus::BadRequest)
    end
  end

  describe "fail with something" do
    before do
      DeltaAttack::Extractor.should_receive(:extract).and_raise(StandardError)
    end

    it "do_POST.should raise_error(WEBrick::HTTPStatus::InternalServerError)" do
      lambda{ @servlet.do_POST(@req, @res) }.should raise_error(WEBrick::HTTPStatus::InternalServerError)
    end
  end
end
54
+
@@ -0,0 +1,24 @@
1
+ require File.expand_path("../spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/extractor/word'
3
+ require 'java'
4
+ require 'timeout'
5
+ $KCODE = "u"
6
+
7
describe DeltaAttack::Extractor::Word do
  include SpecHelper
  before do
    # Read the sample in binary mode so its bytes are never translated.
    content = File.open(sample_data("myblog.doc"), "rb") { |f| f.read }
    @doc = DeltaAttack::Extractor::Word.new(content.to_java_bytes)
  end

  it { @doc.bytes.should_not be_nil }

  it "data.flatten.first.should =~ /WEBrick/" do
    @doc.data.flatten.first.should =~ /WEBrick/
  end

  # A cached result is the very same object on the second call; checking
  # identity avoids depending on how fast the machine is.
  it "2nd call of data() should be cached" do
    first = @doc.data
    @doc.data.should equal(first)
  end
end
24
+
@@ -0,0 +1,26 @@
1
+ require File.expand_path("spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/extractor'
3
+
4
describe DeltaAttack::Extractor, ".extract" do
  it "(nil, :unknown).should raise_error(DeltaAttack::Extractor::Error)" do
    lambda{
      DeltaAttack::Extractor.extract(nil, :unknown)
    }.should raise_error(DeltaAttack::Extractor::Error)
  end

  describe "(mock, :word)" do
    before do
      @content = mock("content")
      @content.should_receive(:to_java_bytes).and_return(%w(a b c))

      # The Word extractor is replaced by a mock returning fixed data.
      extractor = mock("extractor")
      extractor.should_receive(:data).and_return(%w(a b c))
      DeltaAttack::Extractor::Word.should_receive(:new).with(%w(a b c)).and_return(extractor)
    end

    # Asserts the joined text; the example used to call extract without
    # checking what it returned.
    it 'should == "a\nb\nc"' do
      DeltaAttack::Extractor.extract(@content, :word).should == "a\nb\nc"
    end
  end
end
26
+
@@ -0,0 +1,51 @@
1
+ require File.expand_path("spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/filetype_assumption'
3
+
4
describe DeltaAttack::FiletypeAssumption do
  include SpecHelper

  it "should not support_magic" do
    DeltaAttack::FiletypeAssumption.should_not be_support_magic
  end

  # Extension alone decides the type.
  describe "new('hoge.xls')" do
    before { @assumption = DeltaAttack::FiletypeAssumption.new('hoge.xls') }

    it "filetype.should == :excel" do
      @assumption.filetype.should == :excel
    end
  end

  # A recognised content type decides even when the extension says nothing.
  describe "new('hoge.dat', 'application/vnd.ms-excel')" do
    before { @assumption = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/vnd.ms-excel') }

    it "filetype.should == :excel" do
      @assumption.filetype.should == :excel
    end
  end

  # Neither the content type nor the extension is recognised.
  describe "new('hoge.dat', 'application/octet-stream')" do
    before { @assumption = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/octet-stream') }

    it "filetype.should == :unknown" do
      @assumption.filetype.should == :unknown
    end
  end

  # Content-based detection; pending unless mahoro is available.
  describe "new('hoge.dat', 'application/octet-stream', <content>)" do
    before do
      bytes = File.read(sample_data("13TOKYO.xls"))
      @assumption = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/octet-stream', bytes)
    end

    it "filetype.should == :excel" do
      pending "mahoro is not installed" unless DeltaAttack::FiletypeAssumption.support_magic?
      @assumption.filetype.should == :excel
    end
  end
end
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ # vim:set fileencoding=utf-8 filetype=ruby
3
+ $KCODE = 'u'
4
+
5
+ require 'rubygems'
6
+ $:.unshift(File.expand_path("../lib", File.dirname(__FILE__)))
7
+
8
# Helpers mixed into the example groups.
module SpecHelper
  # Absolute path of the sample document +name+, resolved relative to the
  # directory containing this file (../samples/data/<name>).
  def sample_data(name)
    relative = File.join("..", "samples", "data", name)
    File.expand_path(relative, File.dirname(__FILE__))
  end
end
13
+
metadata ADDED
@@ -0,0 +1,100 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: delta_attack
3
+ version: !ruby/object:Gem::Version
4
+ hash: 19
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 4
10
+ version: 0.1.4
11
+ platform: ruby
12
+ authors:
13
+ - MOROHASHI Kyosuke
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2008-09-30 00:00:00 -07:00
19
+ default_executable: delta_attack_server
20
+ dependencies: []
21
+
22
+ description: extract text from MS Office document with Apache POI
23
+ email: moronatural@gmail.com
24
+ executables:
25
+ - delta_attack_server
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - README
30
+ - ChangeLog
31
+ files:
32
+ - README
33
+ - NOTICE
34
+ - ChangeLog
35
+ - Rakefile
36
+ - bin/delta_attack_server
37
+ - spec/extractor/excel_spec.rb
38
+ - spec/extractor/power_point_spec.rb
39
+ - spec/extractor/servlet_spec.rb
40
+ - spec/extractor/word_spec.rb
41
+ - spec/extractor_spec.rb
42
+ - spec/filetype_assumption_spec.rb
43
+ - spec/spec_helper.rb
44
+ - lib/delta_attack/client.rb
45
+ - lib/delta_attack/extractor/base.rb
46
+ - lib/delta_attack/extractor/excel.rb
47
+ - lib/delta_attack/extractor/power_point.rb
48
+ - lib/delta_attack/extractor/servlet.rb
49
+ - lib/delta_attack/extractor/word.rb
50
+ - lib/delta_attack/extractor.rb
51
+ - lib/delta_attack/filetype_assumption.rb
52
+ - lib/delta_attack.rb
53
+ - lib/vendor/README
54
+ has_rdoc: true
55
+ homepage: http://github.com/moro/delta_attack
56
+ licenses: []
57
+
58
+ post_install_message:
59
+ rdoc_options:
60
+ - --title
61
+ - delta_attack documentation
62
+ - --charset
63
+ - utf-8
64
+ - --opname
65
+ - index.html
66
+ - --line-numbers
67
+ - --main
68
+ - README
69
+ - --inline-source
70
+ - --exclude
71
+ - ^(examples|extras)/
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ none: false
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ hash: 3
80
+ segments:
81
+ - 0
82
+ version: "0"
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ hash: 3
89
+ segments:
90
+ - 0
91
+ version: "0"
92
+ requirements: []
93
+
94
+ rubyforge_project:
95
+ rubygems_version: 1.3.7
96
+ signing_key:
97
+ specification_version: 2
98
+ summary: extract text from MS Office document with Apache POI
99
+ test_files: []
100
+