delta_attack 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +4 -0
- data/NOTICE +5 -0
- data/README +50 -0
- data/Rakefile +139 -0
- data/bin/delta_attack_server +66 -0
- data/lib/delta_attack.rb +5 -0
- data/lib/delta_attack/client.rb +65 -0
- data/lib/delta_attack/extractor.rb +21 -0
- data/lib/delta_attack/extractor/base.rb +23 -0
- data/lib/delta_attack/extractor/excel.rb +41 -0
- data/lib/delta_attack/extractor/power_point.rb +20 -0
- data/lib/delta_attack/extractor/servlet.rb +37 -0
- data/lib/delta_attack/extractor/word.rb +25 -0
- data/lib/delta_attack/filetype_assumption.rb +46 -0
- data/lib/vendor/README +8 -0
- data/spec/extractor/excel_spec.rb +23 -0
- data/spec/extractor/power_point_spec.rb +23 -0
- data/spec/extractor/servlet_spec.rb +54 -0
- data/spec/extractor/word_spec.rb +24 -0
- data/spec/extractor_spec.rb +26 -0
- data/spec/filetype_assumption_spec.rb +51 -0
- data/spec/spec_helper.rb +13 -0
- metadata +100 -0
data/ChangeLog
ADDED
data/NOTICE
ADDED
data/README
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
|
2
|
+
= delta_attack
|
3
|
+
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
Extract MS Office files to plain text.
|
8
|
+
|
9
|
+
== Installation
|
10
|
+
|
11
|
+
|
12
|
+
=== Archive Installation
|
13
|
+
|
14
|
+
$ rake install
|
15
|
+
|
16
|
+
=== Gem Installation
|
17
|
+
|
18
|
+
$ gem source -a http://gems.github.com
|
19
|
+
$ gem install moro-delta-attack
|
20
|
+
|
21
|
+
== Features/Problems
|
22
|
+
|
23
|
+
Extract MS Office files to plain text using Apache POI and JRuby.
|
24
|
+
It works with Client/Server architecture.
|
25
|
+
|
26
|
+
The extract server runs on JRuby, but the client works with
|
27
|
+
both CRuby and JRuby.
|
28
|
+
|
29
|
+
This library was originally written to index Office documents in a full-text
|
30
|
+
search engine.
|
31
|
+
|
32
|
+
== Synopsis
|
33
|
+
|
34
|
+
First, start the DeltaAttack server, which needs JRuby and Apache POI:
|
35
|
+
|
36
|
+
$ export CLASSPATH=path/to/poi-3.1-FINAL/poi-3.1-FINAL-20080629.jar:\
|
37
|
+
path/to/poi-3.1-FINAL/poi-scratchpad-3.1-FINAL-20080629.jar
|
38
|
+
$ jruby bin/delta_attack_server
|
39
|
+
|
40
|
+
Then you can use DeltaAttack::Client, in both CRuby(MRI) and JRuby.
|
41
|
+
|
42
|
+
require 'delta_attack/client'
|
43
|
+
DeltaAttack::Client.cast("path/to/some.xls")
|
44
|
+
|
45
|
+
== Copyright
|
46
|
+
|
47
|
+
Author:: moro <moronatural@gmail.com>
|
48
|
+
Copyright:: Copyright (c) 2008 moro
|
49
|
+
License:: MIT
|
50
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'rake/clean'
|
4
|
+
require 'rake/testtask'
|
5
|
+
require 'rake/packagetask'
|
6
|
+
require 'rake/gempackagetask'
|
7
|
+
require 'rake/rdoctask'
|
8
|
+
require 'rake/contrib/rubyforgepublisher'
|
9
|
+
require 'rake/contrib/sshpublisher'
|
10
|
+
require 'lib/delta_attack'
|
11
|
+
require 'spec/rake/spectask'
|
12
|
+
require 'fileutils'
|
13
|
+
include FileUtils
|
14
|
+
|
15
|
+
NAME = "delta_attack"
|
16
|
+
AUTHOR = "MOROHASHI Kyosuke"
|
17
|
+
EMAIL = "moronatural@gmail.com"
|
18
|
+
DESCRIPTION = "extract text from MS Office document with Apache POI"
|
19
|
+
# RUBYFORGE_PROJECT = "delta_attack"
|
20
|
+
HOMEPATH = "http://github.com/moro/delta_attack"
|
21
|
+
BIN_FILES = %w( delta_attack_server )
|
22
|
+
VERS = DeltaAttack::VERSION
|
23
|
+
|
24
|
+
|
25
|
+
# Subversion revision recorded in .svn/entries, or nil when that file cannot be read
# or contains no committed-rev attribute.
# The capture must be \d+ (digits); the previous "(d+)" only matched literal "d" characters.
REV = begin
  File.read(".svn/entries")[/committed-rev="(\d+)"/, 1]
rescue SystemCallError
  # Missing or unreadable metadata (Errno::*) simply means "no revision available".
  nil
end
|
26
|
+
CLEAN.include ['**/.*.sw?', '*.gem', '.config']
|
27
|
+
RDOC_OPTS = [
|
28
|
+
'--title', "#{NAME} documentation",
|
29
|
+
"--charset", "utf-8",
|
30
|
+
"--opname", "index.html",
|
31
|
+
"--line-numbers",
|
32
|
+
"--main", "README",
|
33
|
+
"--inline-source",
|
34
|
+
]
|
35
|
+
|
36
|
+
task :default => [:spec]
|
37
|
+
task :package => [:clean]
|
38
|
+
|
39
|
+
Spec::Rake::SpecTask.new("spec") do |t|
|
40
|
+
t.libs << "spec"
|
41
|
+
t.pattern = "spec/**/*_spec.rb"
|
42
|
+
t.verbose = true
|
43
|
+
end
|
44
|
+
|
45
|
+
# Gem specification used by the packaging tasks in this Rakefile.
spec = Gem::Specification.new do |s|
  s.name = NAME
  s.version = VERS
  s.platform = Gem::Platform::RUBY
  s.has_rdoc = true
  s.extra_rdoc_files = ["README", "ChangeLog"]
  s.rdoc_options += RDOC_OPTS + ['--exclude', '^(examples|extras)/']
  s.summary = DESCRIPTION
  s.description = DESCRIPTION
  s.author = AUTHOR
  s.email = EMAIL
  s.homepage = HOMEPATH
  s.executables = BIN_FILES
  # s.rubyforge_project = RUBYFORGE_PROJECT
  s.bindir = "bin"
  s.require_path = "lib"
  # Spec files in this project are named *_spec.rb and live in nested directories;
  # the previous "spec/*_test.rb" glob never matched any file, leaving test_files empty.
  s.test_files = Dir["spec/**/*_spec.rb"]

  #s.add_dependency('activesupport', '>=1.3.1')
  #s.required_ruby_version = '>= 1.8.2'

  # Everything under lib/vendor is excluded from the package except its README.
  s.files = %w(README NOTICE ChangeLog Rakefile) +
    Dir.glob("{bin,doc,spec,lib,templates,generator,extras,website,script}/**/*") +
    Dir.glob("tools/*.rb") -
    Dir.glob("lib/vendor/**/*") +
    Dir.glob("lib/vendor/README")

  s.extensions = FileList["ext/**/extconf.rb"].to_a
end
|
74
|
+
|
75
|
+
Rake::GemPackageTask.new(spec) do |p|
|
76
|
+
p.need_tar = true
|
77
|
+
p.gem_spec = spec
|
78
|
+
end
|
79
|
+
|
80
|
+
task :debug_gem do |p|
|
81
|
+
puts spec.to_ruby
|
82
|
+
end
|
83
|
+
|
84
|
+
task :install do
|
85
|
+
name = "#{NAME}-#{VERS}.gem"
|
86
|
+
sh %{rake package}
|
87
|
+
sh %{sudo gem install pkg/#{name}}
|
88
|
+
end
|
89
|
+
|
90
|
+
task :uninstall => [:clean] do
|
91
|
+
sh %{sudo gem uninstall #{NAME}}
|
92
|
+
end
|
93
|
+
|
94
|
+
|
95
|
+
Rake::RDocTask.new do |rdoc|
|
96
|
+
rdoc.rdoc_dir = 'html'
|
97
|
+
rdoc.options += RDOC_OPTS
|
98
|
+
rdoc.template = "resh"
|
99
|
+
#rdoc.template = "#{ENV['template']}.rb" if ENV['template']
|
100
|
+
if ENV['DOC_FILES']
|
101
|
+
rdoc.rdoc_files.include(ENV['DOC_FILES'].split(/,\s*/))
|
102
|
+
else
|
103
|
+
rdoc.rdoc_files.include('README', 'ChangeLog')
|
104
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
105
|
+
rdoc.rdoc_files.include('ext/**/*.c')
|
106
|
+
end
|
107
|
+
end
|
108
|
+
=begin
|
109
|
+
desc "Publish to RubyForge"
|
110
|
+
task :rubyforge => [:rdoc, :package] do
|
111
|
+
require 'rubyforge'
|
112
|
+
Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'moro').upload
|
113
|
+
end
|
114
|
+
|
115
|
+
desc 'Package and upload the release to rubyforge.'
|
116
|
+
task :release => [:clean, :package] do |t|
|
117
|
+
v = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
|
118
|
+
abort "Versions don't match #{v} vs #{VERS}" unless v == VERS
|
119
|
+
pkg = "pkg/#{NAME}-#{VERS}"
|
120
|
+
|
121
|
+
require 'rubyforge'
|
122
|
+
rf = RubyForge.new
|
123
|
+
puts "Logging in"
|
124
|
+
rf.login
|
125
|
+
|
126
|
+
c = rf.userconfig
|
127
|
+
# c["release_notes"] = description if description
|
128
|
+
# c["release_changes"] = changes if changes
|
129
|
+
c["preformatted"] = true
|
130
|
+
|
131
|
+
files = [
|
132
|
+
"#{pkg}.tgz",
|
133
|
+
"#{pkg}.gem"
|
134
|
+
].compact
|
135
|
+
|
136
|
+
puts "Releasing #{NAME} v. #{VERS}"
|
137
|
+
rf.add_release RUBYFORGE_PROJECT, NAME, VERS, *files
|
138
|
+
end
|
139
|
+
=end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# vim:set fileencoding=utf-8 filetype=ruby
|
3
|
+
$KCODE = 'u'
|
4
|
+
|
5
|
+
require "optparse"
|
6
|
+
require "rbconfig"
|
7
|
+
require "delta_attack"
|
8
|
+
|
9
|
+
module DeltaAttack
  # Command-line front end that mounts DeltaAttack::Extractor::Servlet on a
  # WEBrick HTTP server.
  class Server

    # Option values used when the corresponding command-line flag is absent.
    DEFAULT_OPTION = {
      :port  => 3333,
      :mount => "/extract",
    }.freeze

    # Starts the server in this process when the interpreter's arch string
    # mentions Java; otherwise replaces the process with `jruby <script> <argv>`.
    def self.run(argv)
      if RbConfig::CONFIG["arch"] =~ /java/i
        new(argv.dup).run
      else
        exec("jruby", $0, *argv)
      end
    end

    # argv:: command-line arguments; they are parsed later, in #run.
    def initialize(argv)
      @argv = argv
      @options = DEFAULT_OPTION.dup

      @parser = OptionParser.new do |parser|
        parser.banner = "Usage: #$0 [options]\n"

        parser.separator "Options:"
        parser.on("-p", "--port=PORT", Integer, "specify port default: #{DEFAULT_OPTION[:port]}") do |v|
          @options[:port] = v
        end
        parser.on("-m", "--mount=PATH", String, "mount path of extract servlet default: #{DEFAULT_OPTION[:mount].dump}") do |v|
          @options[:mount] = v
        end

        parser.separator ""

        parser.on("--version", "Show version string `#{VERSION}'") do
          puts VERSION
          exit
        end
      end
    end

    # Parses the options, mounts the extract servlet and serves until the
    # process receives INT or TERM.
    def run
      begin
        @parser.order!(@argv)
      rescue OptionParser::ParseError => e
        # Report bad options with the usage text instead of a backtrace.
        abort "#{e.message}\n#{@parser.help}"
      end

      # Loaded here rather than at file top so option handling (e.g. --version)
      # does not need the extractor classes.
      require 'webrick/httpserver'
      require 'delta_attack/extractor'
      require 'delta_attack/extractor/servlet'

      @server = WEBrick::HTTPServer.new(:Port => @options[:port])
      @server.mount(@options[:mount], DeltaAttack::Extractor::Servlet)
      %w(INT TERM).each do |signal|
        trap(signal) { @server.shutdown }
      end
      @server.start
    end
  end
end
|
64
|
+
|
65
|
+
DeltaAttack::Server.run(ARGV)
|
66
|
+
|
data/lib/delta_attack.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
|
2
|
+
require 'net/http'
|
3
|
+
require 'delta_attack/filetype_assumption'
|
4
|
+
require 'securerandom'
|
5
|
+
|
6
|
+
module DeltaAttack
  # HTTP client that posts a document to a DeltaAttack server as
  # multipart/form-data and returns the response body.
  class Client
    class << self
      # Sends the file at +filename+ (read from disk on demand) and returns the
      # response body. Raises RuntimeError when the server refuses the
      # connection or answers with anything other than 200 OK.
      def cast(filename, content_type = nil, host="localhost", port=3333)
        cast_buf(nil, filename, content_type, host, port)
      end
      alias extract cast

      # Same as .cast, but sends the in-memory +content+; +filename+ is only
      # used for the multipart header and for guessing the content type.
      def cast_buf(content, filename = "no-filename", content_type = nil, host="localhost", port=3333)
        client = new(filename, content)
        client.content_type = content_type
        res = Net::HTTP.start(host, port){|http| http.request(client.request) }
        raise "Request failed #{res}" unless res.is_a? Net::HTTPOK
        res.body
      rescue Errno::ECONNREFUSED
        raise "DeltaAttack Server is down on http://#{host}:#{port}"
      end
      alias extract_buf cast_buf
    end

    attr_writer :content_type

    def initialize(filename, content=nil)
      @filename = filename
      @content = content
    end

    # Multipart boundary, generated once per client.
    # Uses SecureRandom (already required by this file) instead of Digest::SHA1,
    # which was never required and produced the same value for every request.
    def boundary
      @boundary ||= SecureRandom.hex(16)
    end

    # Payload bytes; read from @filename in binary mode when no content was given.
    def content
      @content ||= File.open(@filename,"rb"){|f| f.read }
    end

    # Explicitly assigned content type, or one guessed from the file name.
    def content_type
      @content_type ||= FiletypeAssumption.new(File.basename(@filename)).content_type
    end

    # Builds the multipart/form-data body containing a single "file" part.
    def body
      data = binary("")
      data << binary("--#{boundary}\r\n")
      data << binary("Content-Disposition: form-data; name=\"file\"; filename=\"#{@filename}\"\r\n")
      data << binary("Content-Type: #{content_type}\r\n\r\n")
      data << binary(content)
      data << binary("\r\n--#{boundary}--\r\n")
    end

    # Builds the POST request for +path+ carrying #body.
    def request(path = "/extract" )
      payload = body
      req = Net::HTTP::Post.new(path)
      req.content_type = "multipart/form-data; boundary=#{boundary}"
      req.body = payload
      # Content-Length counts bytes, not characters.
      req.content_length = payload.bytesize
      req
    end

    private

    # Returns a copy of +str+ tagged as binary where the runtime has string
    # encodings, so a non-ASCII file name and raw document bytes can be
    # concatenated without Encoding::CompatibilityError. The caller's string
    # is left untouched.
    def binary(str)
      copy = str.dup
      copy.force_encoding("BINARY") if copy.respond_to?(:force_encoding)
      copy
    end
  end
end
|
65
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'delta_attack/extractor/base'
|
2
|
+
require 'delta_attack/extractor/word'
|
3
|
+
require 'delta_attack/extractor/excel'
|
4
|
+
require 'delta_attack/extractor/power_point'
|
5
|
+
|
6
|
+
module DeltaAttack
  module Extractor
    # Raised when no extractor exists for the requested type.
    Error = Class.new(RuntimeError)

    # Picks the extractor class for +type+ (:word, :excel or :power_point),
    # feeds it content.to_java_bytes and returns the flattened data joined
    # with newlines. Any other +type+ raises Extractor::Error before
    # +content+ is touched.
    def extract(content,type)
      klass =
        if :word == type
          Word
        elsif :excel == type
          Excel
        elsif :power_point == type
          PowerPoint
        else
          raise Error.new("not supported")
        end

      java_bytes = klass.new(content.to_java_bytes)
      java_bytes.data.flatten.join("\n")
    end
    module_function :extract
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'java'
|
2
|
+
|
3
|
+
module DeltaAttack
  module Extractor
    # Shared behaviour for the format-specific extractors: holds the raw
    # bytes and memoizes whatever the subclass's extract_data returns.
    class Base
      attr_accessor :bytes

      def initialize(bytes)
        @bytes = bytes
      end

      # Returns the extracted data, calling extract_data on first use and
      # again whenever +ignore_cache+ is true (or the cached value is falsy).
      def data(ignore_cache=false)
        @data = extract_data if ignore_cache || !@data
        @data
      end

      private

      # Wraps the stored bytes in a java.io.ByteArrayInputStream.
      def java_input_stream
        Java::JavaIo::ByteArrayInputStream.new(@bytes)
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'delta_attack/extractor/base'
|
2
|
+
|
3
|
+
include_class 'org.apache.poi.hssf.usermodel.HSSFWorkbook'
|
4
|
+
include_class 'org.apache.poi.hssf.usermodel.HSSFCell'
|
5
|
+
|
6
|
+
module DeltaAttack
  module Extractor
    # Extractor built on POI's HSSFWorkbook; data is a nested array of
    # sheets -> rows -> cell values.
    class Excel < Base
      private

      # Opens the workbook from the stored bytes and collects every sheet.
      # The input stream is closed whether or not parsing succeeds.
      def extract_data
        stream = java_input_stream
        begin
          workbook = HSSFWorkbook.new(stream)
          sheet_indexes = (0...workbook.number_of_sheets)
          return sheet_indexes.map { |index| extract_sheet(workbook.sheet_at(index)) }
        ensure
          stream.close
        end
      end

      # Converts one sheet into an array of rows, each an array of cell values.
      def extract_sheet(sheet)
        sheet.iterator.map do |row|
          row.iterator.map { |cell| handle_cell(cell) }
        end
      end

      # Numeric cells yield their numeric value and string cells their text;
      # boolean and blank cells (and any type not listed) yield nil.
      def handle_cell(cell)
        case cell.cell_type
        when HSSFCell::CELL_TYPE_NUMERIC then cell.numeric_cell_value
        when HSSFCell::CELL_TYPE_STRING  then cell.rich_string_cell_value.string
        when HSSFCell::CELL_TYPE_BOOLEAN, HSSFCell::CELL_TYPE_BLANK then nil
        end
      end
    end
  end
end
|
41
|
+
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'delta_attack/extractor/base'
|
2
|
+
|
3
|
+
include_class 'org.apache.poi.hslf.usermodel.SlideShow'
|
4
|
+
|
5
|
+
module DeltaAttack
  module Extractor
    # Extractor built on POI's SlideShow; data is an array of slides, each an
    # array of the text of that slide's text runs.
    class PowerPoint < Base
      private

      # Parses the stored bytes as a slide show and collects the text runs of
      # every slide. The input stream is closed in an ensure clause, matching
      # the Excel and Word extractors, so it is released even when parsing fails.
      def extract_data
        input_stream = java_input_stream
        begin
          slide_show = SlideShow.new(input_stream)
          slide_show.slides.map do |slide|
            slide.text_runs.map{|tr| tr.text }
          end
        ensure
          input_stream.close
        end
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'webrick/httpservlet'
|
2
|
+
require 'delta_attack/extractor'
|
3
|
+
require 'delta_attack/filetype_assumption'
|
4
|
+
|
5
|
+
module DeltaAttack
  module Extractor
    # WEBrick servlet: GET renders an upload form, POST extracts the text of
    # the uploaded "file" field and returns it as text/plain.
    class Servlet < WEBrick::HTTPServlet::AbstractServlet
      # Renders a minimal upload form. The form posts back to the path this
      # servlet is mounted on (req.script_name) rather than a hard-coded
      # "/extract", so a custom mount path keeps working.
      def do_GET(req, res)
        action = req.script_name.to_s
        action = "/extract" if action.empty?
        res.content_type = "text/html"
        res.body = <<-HTML
          <html>
          <head></head>
          <body>
          <form action="#{WEBrick::HTMLUtils.escape(action)}" enctype="multipart/form-data" method="post">
          <input type="file" name="file" />
          <input type="submit" name="submit" value="up" />
          </form>
          </body>
          </html>
        HTML
      end

      # Extracts text from the uploaded "file" field.
      # Raises WEBrick::HTTPStatus::BadRequest when the field is missing or
      # the file type is unsupported (Extractor::Error), and
      # WEBrick::HTTPStatus::InternalServerError for any other StandardError.
      def do_POST(req, res)
        f = req.query["file"]
        # Without this guard a request lacking the field fails with NoMethodError on nil.
        raise WEBrick::HTTPStatus::BadRequest, "file is required" if f.nil?

        type = FiletypeAssumption.new(f.filename, f['content-type'])
        begin
          res.body = Extractor.extract(f.to_s, type.filetype)
          res.content_type = "text/plain"
        rescue Extractor::Error => exe
          raise WEBrick::HTTPStatus::BadRequest, exe.message
        rescue StandardError => stdex
          raise WEBrick::HTTPStatus::InternalServerError, stdex.message
        end
      end
    end
  end
end
|
37
|
+
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'delta_attack/extractor/base'
|
2
|
+
|
3
|
+
include_class 'org.apache.poi.hwpf.HWPFDocument'
|
4
|
+
|
5
|
+
module DeltaAttack
  module Extractor
    # Extractor built on POI's HWPFDocument; data is an array with the
    # stripped text of each paragraph.
    class Word < Base

      private

      # Parses the stored bytes as a Word document and returns one stripped
      # string per paragraph. The input stream is always closed.
      def extract_data
        stream = java_input_stream
        begin
          paragraphs = HWPFDocument.new(stream).range
          (0...paragraphs.num_paragraphs).map { |index| paragraphs.paragraph(index).text.strip }
        ensure
          stream.close
        end
      end
    end
  end
end
|
25
|
+
|
@@ -0,0 +1,46 @@
|
|
1
|
+
begin
|
2
|
+
require 'mahoro'
|
3
|
+
rescue LoadError
|
4
|
+
nil
|
5
|
+
end
|
6
|
+
|
7
|
+
module DeltaAttack
  # Guesses which Office format a file is, first from its declared MIME type
  # and then from its file-name extension.
  class FiletypeAssumption
    # MIME type => file type symbol.
    CONTENT_TYPES = {
      "application/msword" => :word,
      "application/vnd.ms-excel" => :excel,
      "application/vnd.ms-powerpoint" => :power_point,
    }.freeze

    # True when the Mahoro constant is defined, false otherwise.
    def self.support_magic?
      defined?(Mahoro) ? true : false
    end

    # filename::     name used for extension-based detection (may be nil)
    # content_type:: declared MIME type (may be nil)
    # content::      stored but not consulted by the methods in this class
    def initialize(filename, content_type = nil, content = nil)
      @filename = filename
      @content_type = content_type
      @content = content
    end

    # Returns :word, :excel, :power_point, or :unknown when neither the
    # content type nor the extension is recognised.
    def filetype
      by_content_type || by_extention || :unknown
    end

    # MIME type string for the detected file type, or nil for :unknown.
    # Hash#key replaces Hash#index, which has been deprecated since Ruby 1.9.
    def content_type
      CONTENT_TYPES.key(filetype)
    end

    private
    # Looks the declared type up after dropping any ";parameter" suffix,
    # surrounding whitespace and letter case.
    def by_content_type
      media_type = @content_type.to_s.split(";").first.to_s.strip.downcase
      CONTENT_TYPES[media_type]
    end

    # Maps the (case-insensitive) extension to a file type; a nil file name
    # is treated as having no extension instead of raising TypeError.
    def by_extention
      case File.extname(@filename.to_s).downcase
      when ".doc" then :word
      when ".xls" then :excel
      when ".ppt" then :power_point
      end
    end
  end
end
|
data/lib/vendor/README
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require File.expand_path("../spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/extractor/excel'
|
3
|
+
require 'java'
|
4
|
+
require 'timeout'
|
5
|
+
|
6
|
+
describe DeltaAttack::Extractor::Excel do
|
7
|
+
include SpecHelper
|
8
|
+
before do
|
9
|
+
content = File.read(sample_data("13TOKYO.xls"))
|
10
|
+
@xls = DeltaAttack::Extractor::Excel.new(content.to_java_bytes)
|
11
|
+
end
|
12
|
+
|
13
|
+
it { @xls.bytes.should_not be_nil }
|
14
|
+
it "data[0][0].should == 13101" do
|
15
|
+
@xls.data[0][0][0].should == 13101
|
16
|
+
end
|
17
|
+
|
18
|
+
it "2nd call of data() should be cached" do
|
19
|
+
@xls.data # 1st.
|
20
|
+
lambda{ timeout(0.1){ @xls.data } }.should_not raise_error(Timeout::Error)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require File.expand_path("../spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/extractor/power_point'
|
3
|
+
require 'java'
|
4
|
+
require 'timeout'
|
5
|
+
|
6
|
+
describe DeltaAttack::Extractor::PowerPoint do
|
7
|
+
include SpecHelper
|
8
|
+
before do
|
9
|
+
content = File.read(sample_data("named_scope06.ppt"))
|
10
|
+
@ppt = DeltaAttack::Extractor::PowerPoint.new(content.to_java_bytes)
|
11
|
+
end
|
12
|
+
|
13
|
+
it { @ppt.bytes.should_not be_nil }
|
14
|
+
it "data.flatten.first.should == /named_scope/" do
|
15
|
+
@ppt.data.flatten.first.should =~ /named_scope/
|
16
|
+
end
|
17
|
+
|
18
|
+
it "2nd call of data() should be cached" do
|
19
|
+
@ppt.data # 1st.
|
20
|
+
lambda{ timeout(0.1){ @ppt.data } }.should_not raise_error(Timeout::Error)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require File.expand_path("../spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/extractor/servlet'
|
3
|
+
|
4
|
+
describe DeltaAttack::Extractor::Servlet do
|
5
|
+
before do
|
6
|
+
@servlet = DeltaAttack::Extractor::Servlet.new("hoge", {})
|
7
|
+
|
8
|
+
file = mock("upload_file")
|
9
|
+
file.should_receive(:filename).and_return("foo.xls")
|
10
|
+
file.should_receive(:[]).with("content-type").and_return("application/vnd.ms-excel")
|
11
|
+
file.should_receive(:to_s).and_return("DATA-DATA")
|
12
|
+
|
13
|
+
@req = mock("request")
|
14
|
+
@req.should_receive(:query).and_return("file"=>file)
|
15
|
+
|
16
|
+
@res = Struct.new(:body, :content_type, :status).new
|
17
|
+
end
|
18
|
+
|
19
|
+
describe "pass" do
|
20
|
+
before do
|
21
|
+
DeltaAttack::Extractor.should_receive(:extract).with("DATA-DATA", :excel).and_return("RESPONSE")
|
22
|
+
@servlet.do_POST(@req, @res)
|
23
|
+
end
|
24
|
+
|
25
|
+
it "@res.body.should == 'RESPONSE'" do
|
26
|
+
@res.body.should == 'RESPONSE'
|
27
|
+
end
|
28
|
+
|
29
|
+
# Checks the content type set by do_POST; this example previously re-asserted
# @res.body, so the content type was never verified.
it "@res.content_type.should == 'text/plain'" do
  @res.content_type.should == 'text/plain'
end
|
32
|
+
end
|
33
|
+
|
34
|
+
describe "fail with unsupported type" do
|
35
|
+
before do
|
36
|
+
DeltaAttack::Extractor.should_receive(:extract).and_raise(DeltaAttack::Extractor::Error)
|
37
|
+
end
|
38
|
+
|
39
|
+
it "do_POST.should raise_error(WEBrick::HTTPStatus::BadRequest)" do
|
40
|
+
lambda{ @servlet.do_POST(@req, @res) }.should raise_error(WEBrick::HTTPStatus::BadRequest)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
describe "fail with something" do
|
45
|
+
before do
|
46
|
+
DeltaAttack::Extractor.should_receive(:extract).and_raise(StandardError)
|
47
|
+
end
|
48
|
+
|
49
|
+
# The description now names the status the assertion actually expects.
it "do_POST.should raise_error(WEBrick::HTTPStatus::InternalServerError)" do
  lambda{ @servlet.do_POST(@req, @res) }.should raise_error(WEBrick::HTTPStatus::InternalServerError)
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.expand_path("../spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/extractor/word'
|
3
|
+
require 'java'
|
4
|
+
require 'timeout'
|
5
|
+
$KCODE = "u"
|
6
|
+
|
7
|
+
describe DeltaAttack::Extractor::Word do
|
8
|
+
include SpecHelper
|
9
|
+
before do
|
10
|
+
content = File.read(sample_data("myblog.doc"))
|
11
|
+
@doc = DeltaAttack::Extractor::Word.new(content.to_java_bytes)
|
12
|
+
end
|
13
|
+
|
14
|
+
it { @doc.bytes.should_not be_nil }
|
15
|
+
it "data.flatten.first.should =~ /WEBrick/" do
|
16
|
+
@doc.data.flatten.first.should =~ /WEBrick/
|
17
|
+
end
|
18
|
+
|
19
|
+
it "2nd call of data() should be cached" do
|
20
|
+
@doc.data # 1st.
|
21
|
+
lambda{ timeout(0.1){ @doc.data } }.should_not raise_error(Timeout::Error)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require File.expand_path("spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/extractor'
|
3
|
+
|
4
|
+
describe DeltaAttack::Extractor, ".extract" do
|
5
|
+
it "(nil, :unknown).should raise_error(DeltaAttack::Extractor::Error)" do
|
6
|
+
lambda{
|
7
|
+
DeltaAttack::Extractor.extract(nil, :unknown)
|
8
|
+
}.should raise_error(DeltaAttack::Extractor::Error)
|
9
|
+
end
|
10
|
+
|
11
|
+
describe "(mock, :word)" do
|
12
|
+
before do
|
13
|
+
@content = mock("content")
|
14
|
+
@content.should_receive(:to_java_bytes).and_return(%w(a b c))
|
15
|
+
|
16
|
+
extractor = mock("extractor")
|
17
|
+
extractor.should_receive(:data).and_return(%w(a b c))
|
18
|
+
DeltaAttack::Extractor::Word.should_receive(:new).with(%w(a b c)).and_return(extractor)
|
19
|
+
end
|
20
|
+
|
21
|
+
# The mocked extractor returns %w(a b c), which extract flattens and joins with
# newlines; the example previously called extract without asserting the result.
it 'should == "a\nb\nc"' do
  DeltaAttack::Extractor.extract(@content, :word).should == "a\nb\nc"
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require File.expand_path("spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/filetype_assumption'
|
3
|
+
|
4
|
+
describe DeltaAttack::FiletypeAssumption do
|
5
|
+
include SpecHelper
|
6
|
+
it "should not support_magic" do
|
7
|
+
DeltaAttack::FiletypeAssumption.should_not be_support_magic
|
8
|
+
end
|
9
|
+
|
10
|
+
describe "new('hoge.xls')" do
|
11
|
+
before do
|
12
|
+
@asm = DeltaAttack::FiletypeAssumption.new('hoge.xls')
|
13
|
+
end
|
14
|
+
|
15
|
+
it "filetype.should == :excel" do
|
16
|
+
@asm.filetype.should == :excel
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
describe "new('hoge.dat', 'application/vnd.ms-excel')" do
|
21
|
+
before do
|
22
|
+
@asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/vnd.ms-excel')
|
23
|
+
end
|
24
|
+
|
25
|
+
it "filetype.should == :excel" do
|
26
|
+
@asm.filetype.should == :excel
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe "new('hoge.dat', 'application/octet-stream')" do
|
31
|
+
before do
|
32
|
+
@asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/octet-stream')
|
33
|
+
end
|
34
|
+
|
35
|
+
it "filetype.should == :unknown" do
|
36
|
+
@asm.filetype.should == :unknown
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
describe "new('hoge.dat', 'application/octet-stream', <content>)" do
|
41
|
+
before do
|
42
|
+
content = File.read(sample_data("13TOKYO.xls"))
|
43
|
+
@asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/octet-stream', content)
|
44
|
+
end
|
45
|
+
|
46
|
+
it "filetype.should == :excel" do
|
47
|
+
pending "mahoro is not installed" unless DeltaAttack::FiletypeAssumption.support_magic?
|
48
|
+
@asm.filetype.should == :excel
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# vim:set fileencoding=utf-8 filetype=ruby
|
3
|
+
$KCODE = 'u'
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
$:.unshift(File.expand_path("../lib", File.dirname(__FILE__)))
|
7
|
+
|
8
|
+
# Helpers mixed into the example groups.
module SpecHelper
  # Absolute path of the sample document +name+ under ../samples/data,
  # resolved relative to the directory containing this file.
  def sample_data(name)
    relative_path = "../samples/data/" + name
    here = File.dirname(__FILE__)
    File.expand_path(relative_path, here)
  end
end
|
13
|
+
|
metadata
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: delta_attack
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 4
|
10
|
+
version: 0.1.4
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- MOROHASHI Kyosuke
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2008-09-30 00:00:00 -07:00
|
19
|
+
default_executable: delta_attack_server
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: extract text from MS Office document with Apache POI
|
23
|
+
email: moronatural@gmail.com
|
24
|
+
executables:
|
25
|
+
- delta_attack_server
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files:
|
29
|
+
- README
|
30
|
+
- ChangeLog
|
31
|
+
files:
|
32
|
+
- README
|
33
|
+
- NOTICE
|
34
|
+
- ChangeLog
|
35
|
+
- Rakefile
|
36
|
+
- bin/delta_attack_server
|
37
|
+
- spec/extractor/excel_spec.rb
|
38
|
+
- spec/extractor/power_point_spec.rb
|
39
|
+
- spec/extractor/servlet_spec.rb
|
40
|
+
- spec/extractor/word_spec.rb
|
41
|
+
- spec/extractor_spec.rb
|
42
|
+
- spec/filetype_assumption_spec.rb
|
43
|
+
- spec/spec_helper.rb
|
44
|
+
- lib/delta_attack/client.rb
|
45
|
+
- lib/delta_attack/extractor/base.rb
|
46
|
+
- lib/delta_attack/extractor/excel.rb
|
47
|
+
- lib/delta_attack/extractor/power_point.rb
|
48
|
+
- lib/delta_attack/extractor/servlet.rb
|
49
|
+
- lib/delta_attack/extractor/word.rb
|
50
|
+
- lib/delta_attack/extractor.rb
|
51
|
+
- lib/delta_attack/filetype_assumption.rb
|
52
|
+
- lib/delta_attack.rb
|
53
|
+
- lib/vendor/README
|
54
|
+
has_rdoc: true
|
55
|
+
homepage: http://github.com/moro/delta_attack
|
56
|
+
licenses: []
|
57
|
+
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options:
|
60
|
+
- --title
|
61
|
+
- delta_attack documentation
|
62
|
+
- --charset
|
63
|
+
- utf-8
|
64
|
+
- --opname
|
65
|
+
- index.html
|
66
|
+
- --line-numbers
|
67
|
+
- --main
|
68
|
+
- README
|
69
|
+
- --inline-source
|
70
|
+
- --exclude
|
71
|
+
- ^(examples|extras)/
|
72
|
+
require_paths:
|
73
|
+
- lib
|
74
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
75
|
+
none: false
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
hash: 3
|
80
|
+
segments:
|
81
|
+
- 0
|
82
|
+
version: "0"
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
hash: 3
|
89
|
+
segments:
|
90
|
+
- 0
|
91
|
+
version: "0"
|
92
|
+
requirements: []
|
93
|
+
|
94
|
+
rubyforge_project:
|
95
|
+
rubygems_version: 1.3.7
|
96
|
+
signing_key:
|
97
|
+
specification_version: 2
|
98
|
+
summary: extract text from MS Office document with Apache POI
|
99
|
+
test_files: []
|
100
|
+
|