delta_attack 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ChangeLog ADDED
@@ -0,0 +1,4 @@
1
+ == 0.0.1 / 2008-09-12
2
+
3
+ * initial release
4
+
data/NOTICE ADDED
@@ -0,0 +1,5 @@
1
+ This library depends on the Apache POI libraries,
18
+ which are provided under the Apache License, Version 2.0.
3
+
4
+ http://poi.apache.org/
5
+
data/README ADDED
@@ -0,0 +1,50 @@
1
+
2
+ = delta_attack
3
+
4
+
5
+ == Description
6
+
7
+ Extract MS Office files to plain text.
8
+
9
+ == Installation
10
+
11
+
12
+ === Archive Installation
13
+
14
+ $ rake install
15
+
16
+ === Gem Installation
17
+
18
+ $ gem source -a http://gems.github.com
19
+ $ gem install moro-delta-attack
20
+
21
+ == Features/Problems
22
+
23
+ Extract MS Office files to plain text using Apache POI and JRuby.
24
+ It works with Client/Server architecture.
25
+
26
+ The extract server runs on JRuby, but the client works with
27
+ both CRuby and JRuby.
28
+
29
+ This library was originally intended for indexing Office documents into a
30
+ full-text search engine.
31
+
32
+ == Synopsis
33
+
34
+ First, start the DeltaAttack server, which needs JRuby and Apache POI:
35
+
36
+ $ export CLASSPATH=path/to/poi-3.1-FINAL/poi-3.1-FINAL-20080629.jar:\
37
+ path/to/poi-3.1-FINAL/poi-scratchpad-3.1-FINAL-20080629.jar
38
+ $ jruby bin/delta_attack_server
39
+
40
+ Then you can use DeltaAttack::Client, in both CRuby(MRI) and JRuby.
41
+
42
+ require 'delta_attack/client'
43
+ DeltaAttack::Client.cast("path/to/some.xls")
44
+
45
+ == Copyright
46
+
47
+ Author:: moro <moronatural@gmail.com>
48
+ Copyright:: Copyright (c) 2008 moro
49
+ License:: MIT
50
+
data/Rakefile ADDED
@@ -0,0 +1,139 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/clean'
4
+ require 'rake/testtask'
5
+ require 'rake/packagetask'
6
+ require 'rake/gempackagetask'
7
+ require 'rake/rdoctask'
8
+ require 'rake/contrib/rubyforgepublisher'
9
+ require 'rake/contrib/sshpublisher'
10
+ require 'lib/delta_attack'
11
+ require 'spec/rake/spectask'
12
+ require 'fileutils'
13
+ include FileUtils
14
+
15
+ NAME = "delta_attack"
16
+ AUTHOR = "MOROHASHI Kyosuke"
17
+ EMAIL = "moronatural@gmail.com"
18
+ DESCRIPTION = "extract text from MS Office document with Apache POI"
19
+ # RUBYFORGE_PROJECT = "delta_attack"
20
+ HOMEPATH = "http://github.com/moro/delta_attack"
21
+ BIN_FILES = %w( delta_attack_server )
22
+ VERS = DeltaAttack::VERSION
23
+
24
+
25
# Reads the last committed Subversion revision from an entries file.
#
# entries_path - path to the ".svn/entries" file to inspect.
#
# Returns the revision digits as a String, or nil when the file is missing,
# unreadable, or contains no committed-rev attribute.
def svn_committed_revision(entries_path = ".svn/entries")
  # \d+ captures the revision digits. The previous pattern used a bare "d+",
  # which only matches literal "d" characters and so never found a revision.
  File.read(entries_path)[/committed-rev="(\d+)"/, 1]
rescue SystemCallError, IOError
  # No Subversion metadata available (e.g. a git checkout or an unpacked gem).
  nil
end

REV = svn_committed_revision
26
+ CLEAN.include ['**/.*.sw?', '*.gem', '.config']
27
+ RDOC_OPTS = [
28
+ '--title', "#{NAME} documentation",
29
+ "--charset", "utf-8",
30
+ "--opname", "index.html",
31
+ "--line-numbers",
32
+ "--main", "README",
33
+ "--inline-source",
34
+ ]
35
+
36
+ task :default => [:spec]
37
+ task :package => [:clean]
38
+
39
+ Spec::Rake::SpecTask.new("spec") do |t|
40
+ t.libs << "spec"
41
+ t.pattern = "spec/**/*_spec.rb"
42
+ t.verbose = true
43
+ end
44
+
45
# Gem specification used by the packaging, install and debug tasks.
spec = Gem::Specification.new do |s|
  s.name              = NAME
  s.version           = VERS
  s.platform          = Gem::Platform::RUBY
  s.has_rdoc          = true
  s.extra_rdoc_files  = ["README", "ChangeLog"]
  s.rdoc_options     += RDOC_OPTS + ['--exclude', '^(examples|extras)/']
  s.summary           = DESCRIPTION
  s.description       = DESCRIPTION
  s.author            = AUTHOR
  s.email             = EMAIL
  s.homepage          = HOMEPATH
  s.executables       = BIN_FILES
  # s.rubyforge_project = RUBYFORGE_PROJECT
  s.bindir            = "bin"
  s.require_path      = "lib"
  # The specs are named *_spec.rb and live in nested directories. The previous
  # "spec/*_test.rb" glob matched nothing, so the gem declared no test files.
  s.test_files        = Dir["spec/**/*_spec.rb"]

  #s.add_dependency('activesupport', '>=1.3.1')
  #s.required_ruby_version = '>= 1.8.2'

  # Everything under the listed directories, minus the vendored POI jars
  # (only the vendor README is shipped).
  s.files = %w(README NOTICE ChangeLog Rakefile) +
    Dir.glob("{bin,doc,spec,lib,templates,generator,extras,website,script}/**/*") +
    Dir.glob("tools/*.rb") -
    Dir.glob("lib/vendor/**/*") +
    Dir.glob("lib/vendor/README")

  s.extensions = FileList["ext/**/extconf.rb"].to_a
end
74
+
75
+ Rake::GemPackageTask.new(spec) do |p|
76
+ p.need_tar = true
77
+ p.gem_spec = spec
78
+ end
79
+
80
+ task :debug_gem do |p|
81
+ puts spec.to_ruby
82
+ end
83
+
84
+ task :install do
85
+ name = "#{NAME}-#{VERS}.gem"
86
+ sh %{rake package}
87
+ sh %{sudo gem install pkg/#{name}}
88
+ end
89
+
90
+ task :uninstall => [:clean] do
91
+ sh %{sudo gem uninstall #{NAME}}
92
+ end
93
+
94
+
95
+ Rake::RDocTask.new do |rdoc|
96
+ rdoc.rdoc_dir = 'html'
97
+ rdoc.options += RDOC_OPTS
98
+ rdoc.template = "resh"
99
+ #rdoc.template = "#{ENV['template']}.rb" if ENV['template']
100
+ if ENV['DOC_FILES']
101
+ rdoc.rdoc_files.include(ENV['DOC_FILES'].split(/,\s*/))
102
+ else
103
+ rdoc.rdoc_files.include('README', 'ChangeLog')
104
+ rdoc.rdoc_files.include('lib/**/*.rb')
105
+ rdoc.rdoc_files.include('ext/**/*.c')
106
+ end
107
+ end
108
+ =begin
109
+ desc "Publish to RubyForge"
110
+ task :rubyforge => [:rdoc, :package] do
111
+ require 'rubyforge'
112
+ Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'moro').upload
113
+ end
114
+
115
+ desc 'Package and upload the release to rubyforge.'
116
+ task :release => [:clean, :package] do |t|
117
+ v = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
118
+ abort "Versions don't match #{v} vs #{VERS}" unless v == VERS
119
+ pkg = "pkg/#{NAME}-#{VERS}"
120
+
121
+ require 'rubyforge'
122
+ rf = RubyForge.new
123
+ puts "Logging in"
124
+ rf.login
125
+
126
+ c = rf.userconfig
127
+ # c["release_notes"] = description if description
128
+ # c["release_changes"] = changes if changes
129
+ c["preformatted"] = true
130
+
131
+ files = [
132
+ "#{pkg}.tgz",
133
+ "#{pkg}.gem"
134
+ ].compact
135
+
136
+ puts "Releasing #{NAME} v. #{VERS}"
137
+ rf.add_release RUBYFORGE_PROJECT, NAME, VERS, *files
138
+ end
139
+ =end
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env jruby
2
+ # vim:set fileencoding=utf-8 filetype=ruby
3
+ $KCODE = 'u'
4
+
5
+ require "optparse"
6
+ require "rbconfig"
7
+ require "delta_attack"
8
+
9
module DeltaAttack
  # Command-line launcher for the extraction server: parses the options and
  # serves DeltaAttack::Extractor::Servlet through WEBrick.
  class Server

    # Settings used when the matching command-line switch is not given.
    DEFAULT_OPTION = { :port => 3333, :mount => "/extract" }.freeze

    # Runs the server in this process when the interpreter's arch string
    # mentions java; otherwise replaces the process with `jruby` running the
    # same script with the same arguments.
    def self.run(argv)
      return new(argv.dup).run if RbConfig::CONFIG["arch"] =~ /java/i

      exec("jruby", $0, *argv)
    end

    # argv - Array of command-line arguments; it is parsed (and consumed)
    #        later, when #run is called.
    def initialize(argv)
      @argv    = argv
      @options = DEFAULT_OPTION.dup
      @parser  = build_parser
    end

    # Parses the arguments, mounts the servlet and serves requests. An INT
    # signal asks the WEBrick server to shut down.
    def run
      @parser.order!(@argv)
      # The server libraries are only loaded once the options have parsed.
      %w(webrick/httpserver delta_attack/extractor delta_attack/extractor/servlet).each do |library|
        require library
      end

      port, mount_path = @options.values_at(:port, :mount)
      @server = WEBrick::HTTPServer.new(:Port => port)
      @server.mount(mount_path, DeltaAttack::Extractor::Servlet)
      trap("INT") { @server.shutdown }
      @server.start
    end

    private

    # Builds the OptionParser whose handlers write into @options.
    def build_parser
      OptionParser.new do |opts|
        opts.banner = <<-EOB.gsub(/^\t+/, "")
Usage: #$0 [options]
        EOB

        opts.separator "Options:"
        opts.on("-p", "--port=PORT", Integer, "specify port default: #{DEFAULT_OPTION[:port]}") do |port|
          @options[:port] = port
        end
        opts.on("-m", "--mount=PATH", String, "mount path of extract servlet #{DEFAULT_OPTION[:mount].dump}") do |path|
          @options[:mount] = path
        end

        opts.separator ""

        opts.on("--version", "Show version string `#{VERSION}'") do
          puts VERSION
          exit
        end
      end
    end
  end
end
64
+
65
+ DeltaAttack::Server.run(ARGV)
66
+
@@ -0,0 +1,5 @@
1
+
2
module DeltaAttack
  # Library version. Kept in step with the packaged gem, whose metadata
  # declares 0.1.4; the Rakefile and the server's --version switch both read
  # this constant.
  VERSION = "0.1.4"
end
5
+
@@ -0,0 +1,65 @@
1
+
2
+ require 'net/http'
3
+ require 'delta_attack/filetype_assumption'
4
+ require 'securerandom'
5
+
6
module DeltaAttack
  # HTTP client for a running DeltaAttack extraction server. It uploads one
  # document as a multipart/form-data POST and returns the extracted text.
  class Client
    class << self
      # Uploads the file at +filename+ and returns the extracted text.
      #
      # filename     - path of the document to read and send.
      # content_type - MIME type to send; guessed from the file name when nil.
      # host, port   - address of the extraction server.
      #
      # Raises RuntimeError when the server is unreachable or does not
      # answer with 200 OK.
      def cast(filename, content_type = nil, host="localhost", port=3333)
        cast_buf(nil, filename, content_type, host, port)
      end
      alias extract cast

      # Same as .cast, but sends +content+ (a String of document bytes)
      # instead of reading a file. When +content+ is nil the file named by
      # +filename+ is read.
      def cast_buf(content, filename = "no-filename", content_type = nil, host="localhost", port=3333)
        client = new(filename, content)
        client.content_type = content_type
        res = Net::HTTP.start(host, port){|http| http.request(client.request) }
        raise "Request failed #{res}" unless res.is_a? Net::HTTPOK
        res.body
      rescue Errno::ECONNREFUSED
        raise "DeltaAttack Server is down on http://#{host}:#{port}"
      end
      alias extract_buf cast_buf
    end

    attr_writer :content_type

    # filename - name (or path) of the document; sent as the upload's filename.
    # content  - document bytes, or nil to read them from +filename+ lazily.
    def initialize(filename, content=nil)
      @filename = filename
      @content  = content
    end

    # Multipart boundary for this request. A random token (from the already
    # required securerandom) replaces the former 8-character digest of this
    # source file, which was identical for every request and relied on
    # Digest::SHA1 without requiring it.
    def boundary
      @boundary ||= SecureRandom.hex(16)
    end

    # Document bytes; read from disk in binary mode when none were supplied.
    def content
      @content ||= File.open(@filename,"rb"){|f| f.read }
    end

    # MIME type sent with the upload. Falls back to a guess from the file
    # name, then to application/octet-stream so the part header is never empty.
    def content_type
      @content_type ||= FiletypeAssumption.new(File.basename(@filename)).content_type ||
                        "application/octet-stream"
    end

    # Builds the multipart/form-data body as a binary String.
    def body
      data = String.new
      [
        "--#{boundary}\r\n",
        "Content-Disposition: form-data; name=\"file\"; filename=\"#{@filename}\"\r\n",
        "Content-Type: #{content_type}\r\n\r\n",
        content,
        "\r\n--#{boundary}--\r\n",
      ].each do |part|
        # Every part is appended as raw bytes so that text headers and a
        # binary payload can never trigger an encoding compatibility error.
        data << binary(part)
      end
      data
    end

    # Builds the POST request for +path+ on the extraction server.
    def request(path = "/extract" )
      req = Net::HTTP::Post.new(path)
      req.content_type = "multipart/form-data; boundary=#{boundary}"
      req.body = body
      # Content-Length counts bytes, not characters.
      req.content_length = req.body.bytesize
      req
    end

    private

    # Returns +str+ as a binary-encoded copy. Interpreters whose Strings
    # carry no encoding get the string back unchanged.
    def binary(str)
      return str unless str.respond_to?(:force_encoding)
      str.dup.force_encoding("BINARY")
    end
  end
end
65
+
@@ -0,0 +1,21 @@
1
+ require 'delta_attack/extractor/base'
2
+ require 'delta_attack/extractor/word'
3
+ require 'delta_attack/extractor/excel'
4
+ require 'delta_attack/extractor/power_point'
5
+
6
module DeltaAttack
  module Extractor
    # Raised when no extractor exists for the requested file type.
    Error = Class.new(RuntimeError)

    # Runs the extractor that matches +type+ over +content+.
    #
    # content - object responding to to_java_bytes (the document bytes).
    # type    - :word, :excel or :power_point.
    #
    # Returns the extracted pieces, flattened and joined with newlines.
    # Raises Extractor::Error for any other +type+.
    def extract(content,type)
      klass =
        if    :word == type        then Word
        elsif :excel == type       then Excel
        elsif :power_point == type then PowerPoint
        else  raise Error.new("not supported")
        end

      java_bytes = content.to_java_bytes
      pieces = klass.new(java_bytes).data
      pieces.flatten.join("\n")
    end
    module_function :extract
  end
end
@@ -0,0 +1,23 @@
1
+ require 'java'
2
+
3
module DeltaAttack
  module Extractor
    # Shared parent of the format-specific extractors. Subclasses supply a
    # private extract_data method; this class stores the raw bytes and caches
    # the extraction result.
    class Base
      attr_accessor :bytes

      # bytes - the document bytes handed to the Java input stream.
      def initialize(bytes)
        @bytes = bytes
      end

      # Returns the extracted data, calling extract_data on first use.
      #
      # ignore_cache - when true, extract again even if a result is cached.
      def data(ignore_cache=false)
        if ignore_cache || !@data
          @data = extract_data
        else
          @data
        end
      end

      private

      # Wraps the stored bytes in a java.io.ByteArrayInputStream.
      def java_input_stream
        stream_class = Java::JavaIo::ByteArrayInputStream
        stream_class.new(@bytes)
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ require 'delta_attack/extractor/base'
2
+
3
+ include_class 'org.apache.poi.hssf.usermodel.HSSFWorkbook'
4
+ include_class 'org.apache.poi.hssf.usermodel.HSSFCell'
5
+
6
module DeltaAttack
  module Extractor
    # Extracts cell values from a binary Excel workbook through POI's HSSF API.
    class Excel < Base
      private

      # Returns one entry per sheet; the input stream is closed even when POI
      # raises while reading the workbook.
      def extract_data
        stream = java_input_stream
        begin
          workbook = HSSFWorkbook.new(stream)
          sheet_numbers = 0...workbook.number_of_sheets
          sheet_numbers.map { |number| extract_sheet(workbook.sheet_at(number)) }
        ensure
          stream.close
        end
      end

      # Returns one array of cell values per row of +sheet+.
      def extract_sheet(sheet)
        sheet.iterator.map do |row|
          cells = row.iterator
          cells.map { |cell| handle_cell(cell) }
        end
      end

      # Converts one cell to a Ruby value: the numeric value for numeric
      # cells, the text for string cells, and nil for boolean, blank and any
      # other cell type.
      def handle_cell(cell)
        case cell.cell_type
        when HSSFCell::CELL_TYPE_NUMERIC then cell.numeric_cell_value
        when HSSFCell::CELL_TYPE_STRING  then cell.rich_string_cell_value.string
        when HSSFCell::CELL_TYPE_BOOLEAN, HSSFCell::CELL_TYPE_BLANK then nil
        end
      end
    end
  end
end
41
+
@@ -0,0 +1,20 @@
1
+ require 'delta_attack/extractor/base'
2
+
3
+ include_class 'org.apache.poi.hslf.usermodel.SlideShow'
4
+
5
module DeltaAttack
  module Extractor
    # Extracts the text of every slide in a PowerPoint presentation through
    # POI's HSLF API.
    class PowerPoint < Base
      private

      # Returns one array per slide, each holding the text of that slide's
      # text runs.
      def extract_data
        input_stream = java_input_stream
        begin
          slide_show = SlideShow.new(input_stream)
          slide_show.slides.map do |slide|
            slide.text_runs.map{|tr| tr.text }
          end
        ensure
          # Close the stream even when POI fails to parse the document, as
          # the Excel and Word extractors do; it was previously left open.
          input_stream.close
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'webrick/httpservlet'
2
+ require 'delta_attack/extractor'
3
+ require 'delta_attack/filetype_assumption'
4
+
5
module DeltaAttack
  module Extractor
    # WEBrick servlet exposing the extractors over HTTP: GET serves a small
    # upload form, POST extracts the text of the uploaded document.
    class Servlet < WEBrick::HTTPServlet::AbstractServlet
      # Responds with an HTML form that posts a file to /extract.
      def do_GET(req, res)
        res.body = <<-HTML
<html>
<head></head>
<body>
<form action="/extract" enctype="multipart/form-data" method="post">
<input type="file" name="file" />
<input type="submit" name="submit" value="up" />
</form>
</body>
</html>
        HTML
      end

      # Extracts the uploaded "file" form field and responds with plain text.
      #
      # Raises WEBrick::HTTPStatus::BadRequest when the field is missing or
      # the file type is unsupported, and InternalServerError when the
      # extraction itself fails.
      def do_POST(req, res)
        f = req.query["file"]
        # Without this guard a request lacking the field crashed with a
        # NoMethodError on nil instead of reporting a client error.
        raise WEBrick::HTTPStatus::BadRequest, 'form field "file" is required' unless f

        type = FiletypeAssumption.new(f.filename, f['content-type'])
        begin
          res.body = Extractor.extract(f.to_s, type.filetype)
          res.content_type = "text/plain"
        rescue Extractor::Error => exe
          raise WEBrick::HTTPStatus::BadRequest, exe.message
        rescue StandardError => stdex
          raise WEBrick::HTTPStatus::InternalServerError, stdex.message
        end
      end
    end
  end
end
37
+
@@ -0,0 +1,25 @@
1
+ require 'delta_attack/extractor/base'
2
+
3
+ include_class 'org.apache.poi.hwpf.HWPFDocument'
4
+
5
module DeltaAttack
  module Extractor
    # Extracts paragraph text from a binary Word document through POI's HWPF API.
    class Word < Base

      private

      # Returns the stripped text of each paragraph in document order; the
      # input stream is closed even when POI raises.
      def extract_data
        stream = java_input_stream
        begin
          document = HWPFDocument.new(stream)
          text_range = document.range
          paragraph_numbers = 0...text_range.num_paragraphs
          paragraph_numbers.map { |number| text_range.paragraph(number).text.strip }
        ensure
          stream.close
        end
      end
    end
  end
end
25
+
@@ -0,0 +1,46 @@
1
+ begin
2
+ require 'mahoro'
3
+ rescue LoadError
4
+ nil
5
+ end
6
+
7
module DeltaAttack
  # Guesses which kind of Office document a file is, from its declared
  # content type first and its file extension second.
  class FiletypeAssumption
    # Supported MIME types and the file type symbol each one maps to.
    CONTENT_TYPES = {
      "application/msword"            => :word,
      "application/vnd.ms-excel"      => :excel,
      "application/vnd.ms-powerpoint" => :power_point,
    }.freeze

    # Returns true when the optional Mahoro constant is defined, false otherwise.
    def self.support_magic?
      defined?(Mahoro) ? true : false
    end

    # filename     - name of the file; may be nil when only a content type is known.
    # content_type - declared MIME type, if any.
    # content      - file bytes; stored but not consulted by this class.
    def initialize(filename, content_type = nil, content = nil)
      @filename = filename
      @content_type = content_type
      @content = content
    end

    # Returns :word, :excel, :power_point, or :unknown when neither the
    # content type nor the extension is recognised.
    def filetype
      by_content_type || by_extention || :unknown
    end

    # Returns the MIME type String for the guessed file type, or nil when
    # the type is unknown.
    def content_type
      # Hash#index was deprecated in Ruby 1.9 and removed in 3.0; a lookup in
      # the inverted table behaves the same on every version.
      CONTENT_TYPES.invert[filetype]
    end

    private

    # Looks the declared content type up, ignoring letter case and any
    # parameters after ";" (e.g. "; charset=binary").
    def by_content_type
      media_type = @content_type.to_s.split(";").first.to_s.strip.downcase
      CONTENT_TYPES[media_type]
    end

    # Maps the file extension to a file type; a nil file name counts as having
    # no extension instead of raising a TypeError.
    def by_extention
      case File.extname(@filename.to_s).downcase
      when ".doc" then :word
      when ".xls" then :excel
      when ".ppt" then :power_point
      end
    end
  end
end
data/lib/vendor/README ADDED
@@ -0,0 +1,8 @@
1
+ Download POI from <http://poi.apache.org/> version =< 3.1
2
+
3
+ and symlink it here as poi-current.jar
4
+
5
+ example,
6
+
7
+ $ ls -l lib/vendor/
8
+ lrwxr-xr-x 1 you us 40 Sep 12 09:54 poi-current.jar -> poi-3.1-FINAL/poi-3.1-FINAL-20080629.jar
@@ -0,0 +1,23 @@
1
+ require File.expand_path("../spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/extractor/excel'
3
+ require 'java'
4
+ require 'timeout'
5
+
6
+ describe DeltaAttack::Extractor::Excel do
7
+ include SpecHelper
8
+ before do
9
+ content = File.read(sample_data("13TOKYO.xls"))
10
+ @xls = DeltaAttack::Extractor::Excel.new(content.to_java_bytes)
11
+ end
12
+
13
+ it { @xls.bytes.should_not be_nil }
14
+ it "data[0][0].should == 13101" do
15
+ @xls.data[0][0][0].should == 13101
16
+ end
17
+
18
+ it "2nd call of data() should be cached" do
19
+ @xls.data # 1st.
20
+ lambda{ timeout(0.1){ @xls.data } }.should_not raise_error(Timeout::Error)
21
+ end
22
+ end
23
+
@@ -0,0 +1,23 @@
1
+ require File.expand_path("../spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/extractor/power_point'
3
+ require 'java'
4
+ require 'timeout'
5
+
6
+ describe DeltaAttack::Extractor::PowerPoint do
7
+ include SpecHelper
8
+ before do
9
+ content = File.read(sample_data("named_scope06.ppt"))
10
+ @ppt = DeltaAttack::Extractor::PowerPoint.new(content.to_java_bytes)
11
+ end
12
+
13
+ it { @ppt.bytes.should_not be_nil }
14
+ it "data.flatten.first.should == /named_scope/" do
15
+ @ppt.data.flatten.first.should =~ /named_scope/
16
+ end
17
+
18
+ it "2nd call of data() should be cached" do
19
+ @ppt.data # 1st.
20
+ lambda{ timeout(0.1){ @ppt.data } }.should_not raise_error(Timeout::Error)
21
+ end
22
+ end
23
+
@@ -0,0 +1,54 @@
1
+ require File.expand_path("../spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/extractor/servlet'
3
+
4
+ describe DeltaAttack::Extractor::Servlet do
5
+ before do
6
+ @servlet = DeltaAttack::Extractor::Servlet.new("hoge", {})
7
+
8
+ file = mock("upload_file")
9
+ file.should_receive(:filename).and_return("foo.xls")
10
+ file.should_receive(:[]).with("content-type").and_return("application/vnd.ms-excel")
11
+ file.should_receive(:to_s).and_return("DATA-DATA")
12
+
13
+ @req = mock("request")
14
+ @req.should_receive(:query).and_return("file"=>file)
15
+
16
+ @res = Struct.new(:body, :content_type, :status).new
17
+ end
18
+
19
+ describe "pass" do
20
+ before do
21
+ DeltaAttack::Extractor.should_receive(:extract).with("DATA-DATA", :excel).and_return("RESPONSE")
22
+ @servlet.do_POST(@req, @res)
23
+ end
24
+
25
+ it "@res.body.should == 'RESPONSE'" do
26
+ @res.body.should == 'RESPONSE'
27
+ end
28
+
29
+ it "@res.content_type.should == 'text/plain'" do
30
+ @res.body.should == 'RESPONSE'
31
+ end
32
+ end
33
+
34
+ describe "fail with unsupported type" do
35
+ before do
36
+ DeltaAttack::Extractor.should_receive(:extract).and_raise(DeltaAttack::Extractor::Error)
37
+ end
38
+
39
+ it "do_POST.should raise_error(WEBrick::HTTPStatus::BadRequest)" do
40
+ lambda{ @servlet.do_POST(@req, @res) }.should raise_error(WEBrick::HTTPStatus::BadRequest)
41
+ end
42
+ end
43
+
44
+ describe "fail with something" do
45
+ before do
46
+ DeltaAttack::Extractor.should_receive(:extract).and_raise(StandardError)
47
+ end
48
+
49
+ it "do_POST.should raise_error(WEBrick::HTTPStatus::BadRequest)" do
50
+ lambda{ @servlet.do_POST(@req, @res) }.should raise_error(WEBrick::HTTPStatus::InternalServerError)
51
+ end
52
+ end
53
+ end
54
+
@@ -0,0 +1,24 @@
1
+ require File.expand_path("../spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/extractor/word'
3
+ require 'java'
4
+ require 'timeout'
5
+ $KCODE = "u"
6
+
7
+ describe DeltaAttack::Extractor::Word do
8
+ include SpecHelper
9
+ before do
10
+ content = File.read(sample_data("myblog.doc"))
11
+ @doc = DeltaAttack::Extractor::Word.new(content.to_java_bytes)
12
+ end
13
+
14
+ it { @doc.bytes.should_not be_nil }
15
+ it "data.flatten.first.should =~ /WEBrick/" do
16
+ @doc.data.flatten.first.should =~ /WEBrick/
17
+ end
18
+
19
+ it "2nd call of data() should be cached" do
20
+ @doc.data # 1st.
21
+ lambda{ timeout(0.1){ @doc.data } }.should_not raise_error(Timeout::Error)
22
+ end
23
+ end
24
+
@@ -0,0 +1,26 @@
1
+ require File.expand_path("spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/extractor'
3
+
4
+ describe DeltaAttack::Extractor, ".extract" do
5
+ it "(nil, :unknown).should raise_error(DeltaAttack::Extractor::Error)" do
6
+ lambda{
7
+ DeltaAttack::Extractor.extract(nil, :unknown)
8
+ }.should raise_error(DeltaAttack::Extractor::Error)
9
+ end
10
+
11
+ describe "(mock, :word)" do
12
+ before do
13
+ @content = mock("content")
14
+ @content.should_receive(:to_java_bytes).and_return(%w(a b c))
15
+
16
+ extractor = mock("extractor")
17
+ extractor.should_receive(:data).and_return(%w(a b c))
18
+ DeltaAttack::Extractor::Word.should_receive(:new).with(%w(a b c)).and_return(extractor)
19
+ end
20
+
21
+ it 'should == "a\nb\nc"' do
22
+ DeltaAttack::Extractor.extract(@content, :word)
23
+ end
24
+ end
25
+ end
26
+
@@ -0,0 +1,51 @@
1
+ require File.expand_path("spec_helper", File.dirname(__FILE__))
2
+ require 'delta_attack/filetype_assumption'
3
+
4
+ describe DeltaAttack::FiletypeAssumption do
5
+ include SpecHelper
6
+ it "should not support_magic" do
7
+ DeltaAttack::FiletypeAssumption.should_not be_support_magic
8
+ end
9
+
10
+ describe "new('hoge.xls')" do
11
+ before do
12
+ @asm = DeltaAttack::FiletypeAssumption.new('hoge.xls')
13
+ end
14
+
15
+ it "filetype.should == :excel" do
16
+ @asm.filetype.should == :excel
17
+ end
18
+ end
19
+
20
+ describe "new('hoge.dat', 'application/vnd.ms-excel')" do
21
+ before do
22
+ @asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/vnd.ms-excel')
23
+ end
24
+
25
+ it "filetype.should == :excel" do
26
+ @asm.filetype.should == :excel
27
+ end
28
+ end
29
+
30
+ describe "new('hoge.dat', 'application/octet-stream')" do
31
+ before do
32
+ @asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/octet-stream')
33
+ end
34
+
35
+ it "filetype.should == :unknown" do
36
+ @asm.filetype.should == :unknown
37
+ end
38
+ end
39
+
40
+ describe "new('hoge.dat', 'application/octet-stream', <content>)" do
41
+ before do
42
+ content = File.read(sample_data("13TOKYO.xls"))
43
+ @asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/octet-stream', content)
44
+ end
45
+
46
+ it "filetype.should == :excel" do
47
+ pending "mahoro is not installed" unless DeltaAttack::FiletypeAssumption.support_magic?
48
+ @asm.filetype.should == :excel
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ # vim:set fileencoding=utf-8 filetype=ruby
3
+ $KCODE = 'u'
4
+
5
+ require 'rubygems'
6
+ $:.unshift(File.expand_path("../lib", File.dirname(__FILE__)))
7
+
8
module SpecHelper
  # Returns the absolute path of the sample document +name+, resolved against
  # the samples/data directory one level above this file's directory.
  def sample_data(name)
    relative_path = "../samples/data/" + name
    here = File.dirname(__FILE__)
    File.expand_path(relative_path, here)
  end
end
13
+
metadata ADDED
@@ -0,0 +1,100 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: delta_attack
3
+ version: !ruby/object:Gem::Version
4
+ hash: 19
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 4
10
+ version: 0.1.4
11
+ platform: ruby
12
+ authors:
13
+ - MOROHASHI Kyosuke
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2008-09-30 00:00:00 -07:00
19
+ default_executable: delta_attack_server
20
+ dependencies: []
21
+
22
+ description: extract text from MS Office document with Apache POI
23
+ email: moronatural@gmail.com
24
+ executables:
25
+ - delta_attack_server
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - README
30
+ - ChangeLog
31
+ files:
32
+ - README
33
+ - NOTICE
34
+ - ChangeLog
35
+ - Rakefile
36
+ - bin/delta_attack_server
37
+ - spec/extractor/excel_spec.rb
38
+ - spec/extractor/power_point_spec.rb
39
+ - spec/extractor/servlet_spec.rb
40
+ - spec/extractor/word_spec.rb
41
+ - spec/extractor_spec.rb
42
+ - spec/filetype_assumption_spec.rb
43
+ - spec/spec_helper.rb
44
+ - lib/delta_attack/client.rb
45
+ - lib/delta_attack/extractor/base.rb
46
+ - lib/delta_attack/extractor/excel.rb
47
+ - lib/delta_attack/extractor/power_point.rb
48
+ - lib/delta_attack/extractor/servlet.rb
49
+ - lib/delta_attack/extractor/word.rb
50
+ - lib/delta_attack/extractor.rb
51
+ - lib/delta_attack/filetype_assumption.rb
52
+ - lib/delta_attack.rb
53
+ - lib/vendor/README
54
+ has_rdoc: true
55
+ homepage: http://github.com/moro/delta_attack
56
+ licenses: []
57
+
58
+ post_install_message:
59
+ rdoc_options:
60
+ - --title
61
+ - delta_attack documentation
62
+ - --charset
63
+ - utf-8
64
+ - --opname
65
+ - index.html
66
+ - --line-numbers
67
+ - --main
68
+ - README
69
+ - --inline-source
70
+ - --exclude
71
+ - ^(examples|extras)/
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ none: false
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ hash: 3
80
+ segments:
81
+ - 0
82
+ version: "0"
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ none: false
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ hash: 3
89
+ segments:
90
+ - 0
91
+ version: "0"
92
+ requirements: []
93
+
94
+ rubyforge_project:
95
+ rubygems_version: 1.3.7
96
+ signing_key:
97
+ specification_version: 2
98
+ summary: extract text from MS Office document with Apache POI
99
+ test_files: []
100
+