delta_attack 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +4 -0
- data/NOTICE +5 -0
- data/README +50 -0
- data/Rakefile +139 -0
- data/bin/delta_attack_server +66 -0
- data/lib/delta_attack.rb +5 -0
- data/lib/delta_attack/client.rb +65 -0
- data/lib/delta_attack/extractor.rb +21 -0
- data/lib/delta_attack/extractor/base.rb +23 -0
- data/lib/delta_attack/extractor/excel.rb +41 -0
- data/lib/delta_attack/extractor/power_point.rb +20 -0
- data/lib/delta_attack/extractor/servlet.rb +37 -0
- data/lib/delta_attack/extractor/word.rb +25 -0
- data/lib/delta_attack/filetype_assumption.rb +46 -0
- data/lib/vendor/README +8 -0
- data/spec/extractor/excel_spec.rb +23 -0
- data/spec/extractor/power_point_spec.rb +23 -0
- data/spec/extractor/servlet_spec.rb +54 -0
- data/spec/extractor/word_spec.rb +24 -0
- data/spec/extractor_spec.rb +26 -0
- data/spec/filetype_assumption_spec.rb +51 -0
- data/spec/spec_helper.rb +13 -0
- metadata +100 -0
data/ChangeLog
ADDED
data/NOTICE
ADDED
data/README
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
|
2
|
+
= delta_attack
|
3
|
+
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
Extract MS Office files to plain text.
|
8
|
+
|
9
|
+
== Installation
|
10
|
+
|
11
|
+
|
12
|
+
=== Archive Installation
|
13
|
+
|
14
|
+
$ rake install
|
15
|
+
|
16
|
+
=== Gem Installation
|
17
|
+
|
18
|
+
$ gem source -a http://gems.github.com
|
19
|
+
$ gem install moro-delta-attack
|
20
|
+
|
21
|
+
== Features/Problems
|
22
|
+
|
23
|
+
Extract MS Office files to plain text using Apache POI and JRuby.
|
24
|
+
It works with Client/Server architecture.
|
25
|
+
|
26
|
+
The extraction server runs on JRuby, but the client works with
|
27
|
+
both cRuby and JRuby.
|
28
|
+
|
29
|
+
This library was originally intended to index Office documents for a full-text
|
30
|
+
search engine.
|
31
|
+
|
32
|
+
== Synopsis
|
33
|
+
|
34
|
+
First, start DeltaAttackServer, which needs JRuby and Apache POI.
|
35
|
+
|
36
|
+
$ export CLASSPATH=path/to/poi-3.1-FINAL/poi-3.1-FINAL-20080629.jar:\
|
37
|
+
path/to/poi-3.1-FINAL/poi-scratchpad-3.1-FINAL-20080629.jar
|
38
|
+
$ jruby bin/delta_attack_server
|
39
|
+
|
40
|
+
Then you can use DeltaAttack::Client, in both CRuby(MRI) and JRuby.
|
41
|
+
|
42
|
+
require 'delta_attack/client'
|
43
|
+
DeltaAttack::Client.cast("path/to/some.xls")
|
44
|
+
|
45
|
+
== Copyright
|
46
|
+
|
47
|
+
Author:: moro <moronatural@gmail.com>
|
48
|
+
Copyright:: Copyright (c) 2008 moro
|
49
|
+
License:: MIT
|
50
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'rake/clean'
|
4
|
+
require 'rake/testtask'
|
5
|
+
require 'rake/packagetask'
|
6
|
+
require 'rake/gempackagetask'
|
7
|
+
require 'rake/rdoctask'
|
8
|
+
require 'rake/contrib/rubyforgepublisher'
|
9
|
+
require 'rake/contrib/sshpublisher'
|
10
|
+
require 'lib/delta_attack'
|
11
|
+
require 'spec/rake/spectask'
|
12
|
+
require 'fileutils'
|
13
|
+
include FileUtils
|
14
|
+
|
15
|
+
NAME = "delta_attack"
|
16
|
+
AUTHOR = "MOROHASHI Kyosuke"
|
17
|
+
EMAIL = "moronatural@gmail.com"
|
18
|
+
DESCRIPTION = "extract text from MS Office document with Apache POI"
|
19
|
+
# RUBYFORGE_PROJECT = "delta_attack"
|
20
|
+
HOMEPATH = "http://github.com/moro/delta_attack"
|
21
|
+
BIN_FILES = %w( delta_attack_server )
|
22
|
+
VERS = DeltaAttack::VERSION
|
23
|
+
|
24
|
+
|
25
|
+
# Subversion revision of the working copy, or nil when .svn/entries is absent
# or carries no committed-rev attribute. The capture group must be \d+ (digits);
# a bare "d+" only matches the literal letter "d" and never yields a revision.
REV = File.read(".svn/entries")[/committed-rev="(\d+)"/, 1] rescue nil
|
26
|
+
CLEAN.include ['**/.*.sw?', '*.gem', '.config']
|
27
|
+
RDOC_OPTS = [
|
28
|
+
'--title', "#{NAME} documentation",
|
29
|
+
"--charset", "utf-8",
|
30
|
+
"--opname", "index.html",
|
31
|
+
"--line-numbers",
|
32
|
+
"--main", "README",
|
33
|
+
"--inline-source",
|
34
|
+
]
|
35
|
+
|
36
|
+
task :default => [:spec]
|
37
|
+
task :package => [:clean]
|
38
|
+
|
39
|
+
Spec::Rake::SpecTask.new("spec") do |t|
|
40
|
+
t.libs << "spec"
|
41
|
+
t.pattern = "spec/**/*_spec.rb"
|
42
|
+
t.verbose = true
|
43
|
+
end
|
44
|
+
|
45
|
+
spec = Gem::Specification.new do |s|
|
46
|
+
s.name = NAME
|
47
|
+
s.version = VERS
|
48
|
+
s.platform = Gem::Platform::RUBY
|
49
|
+
s.has_rdoc = true
|
50
|
+
s.extra_rdoc_files = ["README", "ChangeLog"]
|
51
|
+
s.rdoc_options += RDOC_OPTS + ['--exclude', '^(examples|extras)/']
|
52
|
+
s.summary = DESCRIPTION
|
53
|
+
s.description = DESCRIPTION
|
54
|
+
s.author = AUTHOR
|
55
|
+
s.email = EMAIL
|
56
|
+
s.homepage = HOMEPATH
|
57
|
+
s.executables = BIN_FILES
|
58
|
+
# s.rubyforge_project = RUBYFORGE_PROJECT
|
59
|
+
s.bindir = "bin"
|
60
|
+
s.require_path = "lib"
|
61
|
+
# Specs are named *_spec.rb and some live in subdirectories (spec/extractor/),
# so glob recursively for that suffix; "spec/*_test.rb" matched no files.
s.test_files = Dir["spec/**/*_spec.rb"]
|
62
|
+
|
63
|
+
#s.add_dependency('activesupport', '>=1.3.1')
|
64
|
+
#s.required_ruby_version = '>= 1.8.2'
|
65
|
+
|
66
|
+
s.files = %w(README NOTICE ChangeLog Rakefile) +
|
67
|
+
Dir.glob("{bin,doc,spec,lib,templates,generator,extras,website,script}/**/*") +
|
68
|
+
Dir.glob("tools/*.rb") -
|
69
|
+
Dir.glob("lib/vendor/**/*") +
|
70
|
+
Dir.glob("lib/vendor/README")
|
71
|
+
|
72
|
+
s.extensions = FileList["ext/**/extconf.rb"].to_a
|
73
|
+
end
|
74
|
+
|
75
|
+
Rake::GemPackageTask.new(spec) do |p|
|
76
|
+
p.need_tar = true
|
77
|
+
p.gem_spec = spec
|
78
|
+
end
|
79
|
+
|
80
|
+
task :debug_gem do |p|
|
81
|
+
puts spec.to_ruby
|
82
|
+
end
|
83
|
+
|
84
|
+
task :install do
|
85
|
+
name = "#{NAME}-#{VERS}.gem"
|
86
|
+
sh %{rake package}
|
87
|
+
sh %{sudo gem install pkg/#{name}}
|
88
|
+
end
|
89
|
+
|
90
|
+
task :uninstall => [:clean] do
|
91
|
+
sh %{sudo gem uninstall #{NAME}}
|
92
|
+
end
|
93
|
+
|
94
|
+
|
95
|
+
Rake::RDocTask.new do |rdoc|
|
96
|
+
rdoc.rdoc_dir = 'html'
|
97
|
+
rdoc.options += RDOC_OPTS
|
98
|
+
rdoc.template = "resh"
|
99
|
+
#rdoc.template = "#{ENV['template']}.rb" if ENV['template']
|
100
|
+
if ENV['DOC_FILES']
|
101
|
+
rdoc.rdoc_files.include(ENV['DOC_FILES'].split(/,\s*/))
|
102
|
+
else
|
103
|
+
rdoc.rdoc_files.include('README', 'ChangeLog')
|
104
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
105
|
+
rdoc.rdoc_files.include('ext/**/*.c')
|
106
|
+
end
|
107
|
+
end
|
108
|
+
=begin
|
109
|
+
desc "Publish to RubyForge"
|
110
|
+
task :rubyforge => [:rdoc, :package] do
|
111
|
+
require 'rubyforge'
|
112
|
+
Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'moro').upload
|
113
|
+
end
|
114
|
+
|
115
|
+
desc 'Package and upload the release to rubyforge.'
|
116
|
+
task :release => [:clean, :package] do |t|
|
117
|
+
v = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
|
118
|
+
abort "Versions don't match #{v} vs #{VERS}" unless v == VERS
|
119
|
+
pkg = "pkg/#{NAME}-#{VERS}"
|
120
|
+
|
121
|
+
require 'rubyforge'
|
122
|
+
rf = RubyForge.new
|
123
|
+
puts "Logging in"
|
124
|
+
rf.login
|
125
|
+
|
126
|
+
c = rf.userconfig
|
127
|
+
# c["release_notes"] = description if description
|
128
|
+
# c["release_changes"] = changes if changes
|
129
|
+
c["preformatted"] = true
|
130
|
+
|
131
|
+
files = [
|
132
|
+
"#{pkg}.tgz",
|
133
|
+
"#{pkg}.gem"
|
134
|
+
].compact
|
135
|
+
|
136
|
+
puts "Releasing #{NAME} v. #{VERS}"
|
137
|
+
rf.add_release RUBYFORGE_PROJECT, NAME, VERS, *files
|
138
|
+
end
|
139
|
+
=end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# vim:set fileencoding=utf-8 filetype=ruby
|
3
|
+
$KCODE = 'u'
|
4
|
+
|
5
|
+
require "optparse"
|
6
|
+
require "rbconfig"
|
7
|
+
require "delta_attack"
|
8
|
+
|
9
|
+
module DeltaAttack
|
10
|
+
class Server
|
11
|
+
|
12
|
+
DEFAULT_OPTION = {
|
13
|
+
:port => 3333,
|
14
|
+
:mount => "/extract",
|
15
|
+
}.freeze
|
16
|
+
|
17
|
+
def self.run(argv)
|
18
|
+
if RbConfig::CONFIG["arch"] =~ /java/i
|
19
|
+
new(argv.dup).run
|
20
|
+
else
|
21
|
+
exec(*["jruby", $0, *argv])
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def initialize(argv)
|
26
|
+
@argv = argv
|
27
|
+
@options = DEFAULT_OPTION.dup
|
28
|
+
|
29
|
+
@parser = OptionParser.new do |parser|
|
30
|
+
parser.banner = <<-EOB.gsub(/^\t+/, "")
|
31
|
+
Usage: #$0 [options]
|
32
|
+
EOB
|
33
|
+
|
34
|
+
parser.separator "Options:"
|
35
|
+
parser.on("-p", "--port=PORT", Integer, "specify port default: #{DEFAULT_OPTION[:port]}") do |v|
|
36
|
+
@options[:port] = v
|
37
|
+
end
|
38
|
+
parser.on("-m", "--mount=PATH", String, "mount path of extract servlet #{DEFAULT_OPTION[:mount].dump}") do |v|
|
39
|
+
@options[:mount] = v
|
40
|
+
end
|
41
|
+
|
42
|
+
parser.separator ""
|
43
|
+
|
44
|
+
parser.on("--version", "Show version string `#{VERSION}'") do
|
45
|
+
puts VERSION
|
46
|
+
exit
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def run
|
52
|
+
@parser.order!(@argv)
|
53
|
+
require 'webrick/httpserver'
|
54
|
+
require 'delta_attack/extractor'
|
55
|
+
require 'delta_attack/extractor/servlet'
|
56
|
+
|
57
|
+
@server = WEBrick::HTTPServer.new(:Port=>@options[:port])
|
58
|
+
@server.mount(@options[:mount], DeltaAttack::Extractor::Servlet)
|
59
|
+
trap("INT"){ @server.shutdown }
|
60
|
+
@server.start
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
DeltaAttack::Server.run(ARGV)
|
66
|
+
|
data/lib/delta_attack.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
|
2
|
+
require 'net/http'
|
3
|
+
require 'delta_attack/filetype_assumption'
|
4
|
+
require 'securerandom'
|
5
|
+
|
6
|
+
module DeltaAttack
|
7
|
+
class Client
|
8
|
+
class << self
|
9
|
+
def cast(filename, content_type = nil, host="localhost", port=3333)
|
10
|
+
cast_buf(nil, filename, content_type, host, port)
|
11
|
+
end
|
12
|
+
alias extract cast
|
13
|
+
|
14
|
+
def cast_buf(content, filename = "no-filename", content_type = nil, host="localhost", port=3333)
|
15
|
+
begin
|
16
|
+
client = new(filename, content)
|
17
|
+
client.content_type = content_type
|
18
|
+
res = Net::HTTP.start(host, port){|http| http.request(client.request) }
|
19
|
+
raise "Request failed #{res}" unless res.is_a? Net::HTTPOK
|
20
|
+
res.body
|
21
|
+
rescue Errno::ECONNREFUSED => e
|
22
|
+
raise "DeltaAttack Server is down on http://#{host}:#{port}"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
alias extract_buf cast_buf
|
26
|
+
end
|
27
|
+
|
28
|
+
attr_writer :content_type
|
29
|
+
|
30
|
+
def initialize(filename, content=nil)
|
31
|
+
@filename = filename
|
32
|
+
@content = content
|
33
|
+
end
|
34
|
+
|
35
|
+
def boundary
|
36
|
+
# Memoized 8-hex-character multipart boundary. Uses SecureRandom (already
# required by this file) instead of Digest::SHA1, which this file never requires.
@boundary ||= SecureRandom.hex(4)
|
37
|
+
end
|
38
|
+
|
39
|
+
def content
|
40
|
+
@content ||= File.open(@filename,"rb"){|f| f.read }
|
41
|
+
end
|
42
|
+
|
43
|
+
def content_type
|
44
|
+
@content_type ||= FiletypeAssumption.new(File.basename(@filename)).content_type
|
45
|
+
end
|
46
|
+
|
47
|
+
def body
|
48
|
+
data = ''
|
49
|
+
data << "--#{boundary}\r\n"
|
50
|
+
data << "Content-Disposition: form-data; name=\"file\"; filename=\"#{@filename}\"\r\n"
|
51
|
+
data << "Content-Type: #{content_type}\r\n\r\n"
|
52
|
+
data << content
|
53
|
+
data << "\r\n--#{boundary}--\r\n"
|
54
|
+
end
|
55
|
+
|
56
|
+
def request(path = "/extract" )
|
57
|
+
req = Net::HTTP::Post.new(path)
|
58
|
+
req.content_type = "multipart/form-data; boundary=#{boundary}"
|
59
|
+
req.body = body
|
60
|
+
req.content_length = req.body.size
|
61
|
+
req
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'delta_attack/extractor/base'
|
2
|
+
require 'delta_attack/extractor/word'
|
3
|
+
require 'delta_attack/extractor/excel'
|
4
|
+
require 'delta_attack/extractor/power_point'
|
5
|
+
|
6
|
+
module DeltaAttack
|
7
|
+
module Extractor
|
8
|
+
Error = Class.new(RuntimeError)
|
9
|
+
def extract(content,type)
|
10
|
+
extractor = case type
|
11
|
+
when :word then Word
|
12
|
+
when :excel then Excel
|
13
|
+
when :power_point then PowerPoint
|
14
|
+
else raise Error.new("not supported")
|
15
|
+
end
|
16
|
+
|
17
|
+
extractor.new(content.to_java_bytes).data.flatten.join("\n")
|
18
|
+
end
|
19
|
+
module_function :extract
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'java'
|
2
|
+
|
3
|
+
module DeltaAttack
|
4
|
+
module Extractor
|
5
|
+
class Base
|
6
|
+
attr_accessor :bytes
|
7
|
+
def initialize(bytes)
|
8
|
+
@bytes = bytes
|
9
|
+
end
|
10
|
+
|
11
|
+
def data(ignore_cache=false)
|
12
|
+
return @data if (!ignore_cache) && @data
|
13
|
+
|
14
|
+
@data = extract_data
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
def java_input_stream
|
19
|
+
Java::JavaIo::ByteArrayInputStream.new(@bytes)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'delta_attack/extractor/base'
|
2
|
+
|
3
|
+
include_class 'org.apache.poi.hssf.usermodel.HSSFWorkbook'
|
4
|
+
include_class 'org.apache.poi.hssf.usermodel.HSSFCell'
|
5
|
+
|
6
|
+
module DeltaAttack
|
7
|
+
module Extractor
|
8
|
+
class Excel < Base
|
9
|
+
private
|
10
|
+
def extract_data
|
11
|
+
input_stream = java_input_stream
|
12
|
+
begin
|
13
|
+
book = HSSFWorkbook.new(input_stream)
|
14
|
+
return (0...book.number_of_sheets).map do |i|
|
15
|
+
extract_sheet(book.sheet_at(i))
|
16
|
+
end
|
17
|
+
ensure
|
18
|
+
input_stream.close
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def extract_sheet(sheet)
|
23
|
+
sheet.iterator.map do |row|
|
24
|
+
row.iterator.map{|cell| handle_cell(cell) }
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def handle_cell(cell)
|
29
|
+
case cell.cell_type
|
30
|
+
when HSSFCell::CELL_TYPE_NUMERIC
|
31
|
+
cell.numeric_cell_value
|
32
|
+
when HSSFCell::CELL_TYPE_STRING
|
33
|
+
cell.rich_string_cell_value.string
|
34
|
+
when HSSFCell::CELL_TYPE_BOOLEAN, HSSFCell::CELL_TYPE_BLANK
|
35
|
+
nil
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'delta_attack/extractor/base'
|
2
|
+
|
3
|
+
include_class 'org.apache.poi.hslf.usermodel.SlideShow'
|
4
|
+
|
5
|
+
module DeltaAttack
|
6
|
+
module Extractor
|
7
|
+
class PowerPoint < Base
|
8
|
+
private
|
9
|
+
def extract_data
|
10
|
+
input_stream = java_input_stream
|
11
|
+
begin
|
12
|
+
slide_show = SlideShow.new(input_stream)
|
13
|
+
slide_show.slides.map do |slide|
|
14
|
+
slide.text_runs.map{|tr| tr.text }
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'webrick/httpservlet'
|
2
|
+
require 'delta_attack/extractor'
|
3
|
+
require 'delta_attack/filetype_assumption'
|
4
|
+
|
5
|
+
module DeltaAttack
|
6
|
+
module Extractor
|
7
|
+
class Servlet < WEBrick::HTTPServlet::AbstractServlet
|
8
|
+
def do_GET(req, res)
|
9
|
+
res.body = <<-HTML
|
10
|
+
<html>
|
11
|
+
<head></head>
|
12
|
+
<body>
|
13
|
+
<form action="/extract" enctype="multipart/form-data" method="post">
|
14
|
+
<input type="file" name="file" />
|
15
|
+
<input type="submit" name="submit" value="up" />
|
16
|
+
</form>
|
17
|
+
</body>
|
18
|
+
</html>
|
19
|
+
HTML
|
20
|
+
end
|
21
|
+
|
22
|
+
def do_POST(req, res)
|
23
|
+
f = req.query["file"]
|
24
|
+
type = FiletypeAssumption.new(f.filename, f['content-type'])
|
25
|
+
begin
|
26
|
+
res.body = Extractor.extract(f.to_s, type.filetype)
|
27
|
+
res.content_type = "text/plain"
|
28
|
+
rescue Extractor::Error => exe
|
29
|
+
raise WEBrick::HTTPStatus::BadRequest, exe.message
|
30
|
+
rescue StandardError => stdex
|
31
|
+
raise WEBrick::HTTPStatus::InternalServerError, stdex.message
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'delta_attack/extractor/base'
|
2
|
+
|
3
|
+
include_class 'org.apache.poi.hwpf.HWPFDocument'
|
4
|
+
|
5
|
+
module DeltaAttack
|
6
|
+
module Extractor
|
7
|
+
class Word < Base
|
8
|
+
|
9
|
+
private
|
10
|
+
def extract_data
|
11
|
+
input_stream = java_input_stream
|
12
|
+
begin
|
13
|
+
book = HWPFDocument.new(input_stream)
|
14
|
+
range = book.range
|
15
|
+
(0...range.num_paragraphs).map do |i|
|
16
|
+
range.paragraph(i).text.strip
|
17
|
+
end
|
18
|
+
ensure
|
19
|
+
input_stream.close
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
@@ -0,0 +1,46 @@
|
|
1
|
+
begin
|
2
|
+
require 'mahoro'
|
3
|
+
rescue LoadError
|
4
|
+
nil
|
5
|
+
end
|
6
|
+
|
7
|
+
module DeltaAttack
|
8
|
+
class FiletypeAssumption
|
9
|
+
CONTENT_TYPES = {
|
10
|
+
"application/msword" => :word,
|
11
|
+
"application/vnd.ms-excel" => :excel,
|
12
|
+
"application/vnd.ms-powerpoint" => :power_point,
|
13
|
+
}.freeze
|
14
|
+
|
15
|
+
def self.support_magic?
|
16
|
+
defined? Mahoro
|
17
|
+
end
|
18
|
+
|
19
|
+
def initialize(filename, content_type = nil, content = nil)
|
20
|
+
@filename = filename
|
21
|
+
@content_type = content_type
|
22
|
+
@content = content
|
23
|
+
end
|
24
|
+
|
25
|
+
def filetype
|
26
|
+
by_content_type || by_extention || :unknown
|
27
|
+
end
|
28
|
+
|
29
|
+
def content_type
|
30
|
+
# Reverse lookup: MIME type string for the detected filetype, or nil when the
# filetype has no entry in CONTENT_TYPES. Hash#index no longer exists on
# Ruby 3.0+; invert is available on every Ruby version.
CONTENT_TYPES.invert[filetype]
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
def by_content_type
|
35
|
+
CONTENT_TYPES[@content_type]
|
36
|
+
end
|
37
|
+
|
38
|
+
def by_extention
|
39
|
+
case File.extname(@filename).downcase
|
40
|
+
when ".doc" then :word
|
41
|
+
when ".xls" then :excel
|
42
|
+
when ".ppt" then :power_point
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
data/lib/vendor/README
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require File.expand_path("../spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/extractor/excel'
|
3
|
+
require 'java'
|
4
|
+
require 'timeout'
|
5
|
+
|
6
|
+
describe DeltaAttack::Extractor::Excel do
|
7
|
+
include SpecHelper
|
8
|
+
before do
|
9
|
+
content = File.read(sample_data("13TOKYO.xls"))
|
10
|
+
@xls = DeltaAttack::Extractor::Excel.new(content.to_java_bytes)
|
11
|
+
end
|
12
|
+
|
13
|
+
it { @xls.bytes.should_not be_nil }
|
14
|
+
it "data[0][0].should == 13101" do
|
15
|
+
@xls.data[0][0][0].should == 13101
|
16
|
+
end
|
17
|
+
|
18
|
+
it "2nd call of data() should be cached" do
|
19
|
+
@xls.data # 1st.
|
20
|
+
lambda{ timeout(0.1){ @xls.data } }.should_not raise_error(Timeout::Error)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require File.expand_path("../spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/extractor/power_point'
|
3
|
+
require 'java'
|
4
|
+
require 'timeout'
|
5
|
+
|
6
|
+
describe DeltaAttack::Extractor::PowerPoint do
|
7
|
+
include SpecHelper
|
8
|
+
before do
|
9
|
+
content = File.read(sample_data("named_scope06.ppt"))
|
10
|
+
@ppt = DeltaAttack::Extractor::PowerPoint.new(content.to_java_bytes)
|
11
|
+
end
|
12
|
+
|
13
|
+
it { @ppt.bytes.should_not be_nil }
|
14
|
+
it "data.flatten.first.should == /named_scope/" do
|
15
|
+
@ppt.data.flatten.first.should =~ /named_scope/
|
16
|
+
end
|
17
|
+
|
18
|
+
it "2nd call of data() should be cached" do
|
19
|
+
@ppt.data # 1st.
|
20
|
+
lambda{ timeout(0.1){ @ppt.data } }.should_not raise_error(Timeout::Error)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require File.expand_path("../spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/extractor/servlet'
|
3
|
+
|
4
|
+
describe DeltaAttack::Extractor::Servlet do
|
5
|
+
before do
|
6
|
+
@servlet = DeltaAttack::Extractor::Servlet.new("hoge", {})
|
7
|
+
|
8
|
+
file = mock("upload_file")
|
9
|
+
file.should_receive(:filename).and_return("foo.xls")
|
10
|
+
file.should_receive(:[]).with("content-type").and_return("application/vnd.ms-excel")
|
11
|
+
file.should_receive(:to_s).and_return("DATA-DATA")
|
12
|
+
|
13
|
+
@req = mock("request")
|
14
|
+
@req.should_receive(:query).and_return("file"=>file)
|
15
|
+
|
16
|
+
@res = Struct.new(:body, :content_type, :status).new
|
17
|
+
end
|
18
|
+
|
19
|
+
describe "pass" do
|
20
|
+
before do
|
21
|
+
DeltaAttack::Extractor.should_receive(:extract).with("DATA-DATA", :excel).and_return("RESPONSE")
|
22
|
+
@servlet.do_POST(@req, @res)
|
23
|
+
end
|
24
|
+
|
25
|
+
it "@res.body.should == 'RESPONSE'" do
|
26
|
+
@res.body.should == 'RESPONSE'
|
27
|
+
end
|
28
|
+
|
29
|
+
it "@res.content_type.should == 'text/plain'" do
|
30
|
+
# Check the response content type, as the example's description states.
@res.content_type.should == 'text/plain'
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
describe "fail with unsupported type" do
|
35
|
+
before do
|
36
|
+
DeltaAttack::Extractor.should_receive(:extract).and_raise(DeltaAttack::Extractor::Error)
|
37
|
+
end
|
38
|
+
|
39
|
+
it "do_POST.should raise_error(WEBrick::HTTPStatus::BadRequest)" do
|
40
|
+
lambda{ @servlet.do_POST(@req, @res) }.should raise_error(WEBrick::HTTPStatus::BadRequest)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
describe "fail with something" do
|
45
|
+
before do
|
46
|
+
DeltaAttack::Extractor.should_receive(:extract).and_raise(StandardError)
|
47
|
+
end
|
48
|
+
|
49
|
+
it "do_POST.should raise_error(WEBrick::HTTPStatus::BadRequest)" do
|
50
|
+
lambda{ @servlet.do_POST(@req, @res) }.should raise_error(WEBrick::HTTPStatus::InternalServerError)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.expand_path("../spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/extractor/word'
|
3
|
+
require 'java'
|
4
|
+
require 'timeout'
|
5
|
+
$KCODE = "u"
|
6
|
+
|
7
|
+
describe DeltaAttack::Extractor::Word do
|
8
|
+
include SpecHelper
|
9
|
+
before do
|
10
|
+
content = File.read(sample_data("myblog.doc"))
|
11
|
+
@doc = DeltaAttack::Extractor::Word.new(content.to_java_bytes)
|
12
|
+
end
|
13
|
+
|
14
|
+
it { @doc.bytes.should_not be_nil }
|
15
|
+
it "data.flatten.first.should =~ /WEBrick/" do
|
16
|
+
@doc.data.flatten.first.should =~ /WEBrick/
|
17
|
+
end
|
18
|
+
|
19
|
+
it "2nd call of data() should be cached" do
|
20
|
+
@doc.data # 1st.
|
21
|
+
lambda{ timeout(0.1){ @doc.data } }.should_not raise_error(Timeout::Error)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require File.expand_path("spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/extractor'
|
3
|
+
|
4
|
+
describe DeltaAttack::Extractor, ".extract" do
|
5
|
+
it "(nil, :unknown).should raise_error(DeltaAttack::Extractor::Error)" do
|
6
|
+
lambda{
|
7
|
+
DeltaAttack::Extractor.extract(nil, :unknown)
|
8
|
+
}.should raise_error(DeltaAttack::Extractor::Error)
|
9
|
+
end
|
10
|
+
|
11
|
+
describe "(mock, :word)" do
|
12
|
+
before do
|
13
|
+
@content = mock("content")
|
14
|
+
@content.should_receive(:to_java_bytes).and_return(%w(a b c))
|
15
|
+
|
16
|
+
extractor = mock("extractor")
|
17
|
+
extractor.should_receive(:data).and_return(%w(a b c))
|
18
|
+
DeltaAttack::Extractor::Word.should_receive(:new).with(%w(a b c)).and_return(extractor)
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'should == "a\nb\nc"' do
|
22
|
+
# The mocked extractor returns %w(a b c); extract joins the flattened data with newlines.
DeltaAttack::Extractor.extract(@content, :word).should == "a\nb\nc"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require File.expand_path("spec_helper", File.dirname(__FILE__))
|
2
|
+
require 'delta_attack/filetype_assumption'
|
3
|
+
|
4
|
+
describe DeltaAttack::FiletypeAssumption do
|
5
|
+
include SpecHelper
|
6
|
+
it "should not support_magic" do
|
7
|
+
DeltaAttack::FiletypeAssumption.should_not be_support_magic
|
8
|
+
end
|
9
|
+
|
10
|
+
describe "new('hoge.xls')" do
|
11
|
+
before do
|
12
|
+
@asm = DeltaAttack::FiletypeAssumption.new('hoge.xls')
|
13
|
+
end
|
14
|
+
|
15
|
+
it "filetype.should == :excel" do
|
16
|
+
@asm.filetype.should == :excel
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
describe "new('hoge.dat', 'application/vnd.ms-excel')" do
|
21
|
+
before do
|
22
|
+
@asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/vnd.ms-excel')
|
23
|
+
end
|
24
|
+
|
25
|
+
it "filetype.should == :excel" do
|
26
|
+
@asm.filetype.should == :excel
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe "new('hoge.dat', 'application/octet-stream')" do
|
31
|
+
before do
|
32
|
+
@asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/octet-stream')
|
33
|
+
end
|
34
|
+
|
35
|
+
it "filetype.should == :unknown" do
|
36
|
+
@asm.filetype.should == :unknown
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
describe "new('hoge.dat', 'application/octet-stream', <content>)" do
|
41
|
+
before do
|
42
|
+
content = File.read(sample_data("13TOKYO.xls"))
|
43
|
+
@asm = DeltaAttack::FiletypeAssumption.new('hoge.dat', 'application/octet-stream', content)
|
44
|
+
end
|
45
|
+
|
46
|
+
it "filetype.should == :excel" do
|
47
|
+
pending "mahoro is not installed" unless DeltaAttack::FiletypeAssumption.support_magic?
|
48
|
+
@asm.filetype.should == :excel
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# vim:set fileencoding=utf-8 filetype=ruby
|
3
|
+
$KCODE = 'u'
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
$:.unshift(File.expand_path("../lib", File.dirname(__FILE__)))
|
7
|
+
|
8
|
+
module SpecHelper
|
9
|
+
def sample_data(name)
|
10
|
+
File.expand_path("../samples/data/" + name, File.dirname(__FILE__))
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
metadata
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: delta_attack
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 4
|
10
|
+
version: 0.1.4
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- MOROHASHI Kyosuke
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2008-09-30 00:00:00 -07:00
|
19
|
+
default_executable: delta_attack_server
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: extract text from MS Office document with Apache POI
|
23
|
+
email: moronatural@gmail.com
|
24
|
+
executables:
|
25
|
+
- delta_attack_server
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files:
|
29
|
+
- README
|
30
|
+
- ChangeLog
|
31
|
+
files:
|
32
|
+
- README
|
33
|
+
- NOTICE
|
34
|
+
- ChangeLog
|
35
|
+
- Rakefile
|
36
|
+
- bin/delta_attack_server
|
37
|
+
- spec/extractor/excel_spec.rb
|
38
|
+
- spec/extractor/power_point_spec.rb
|
39
|
+
- spec/extractor/servlet_spec.rb
|
40
|
+
- spec/extractor/word_spec.rb
|
41
|
+
- spec/extractor_spec.rb
|
42
|
+
- spec/filetype_assumption_spec.rb
|
43
|
+
- spec/spec_helper.rb
|
44
|
+
- lib/delta_attack/client.rb
|
45
|
+
- lib/delta_attack/extractor/base.rb
|
46
|
+
- lib/delta_attack/extractor/excel.rb
|
47
|
+
- lib/delta_attack/extractor/power_point.rb
|
48
|
+
- lib/delta_attack/extractor/servlet.rb
|
49
|
+
- lib/delta_attack/extractor/word.rb
|
50
|
+
- lib/delta_attack/extractor.rb
|
51
|
+
- lib/delta_attack/filetype_assumption.rb
|
52
|
+
- lib/delta_attack.rb
|
53
|
+
- lib/vendor/README
|
54
|
+
has_rdoc: true
|
55
|
+
homepage: http://github.com/moro/delta_attack
|
56
|
+
licenses: []
|
57
|
+
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options:
|
60
|
+
- --title
|
61
|
+
- delta_attack documentation
|
62
|
+
- --charset
|
63
|
+
- utf-8
|
64
|
+
- --opname
|
65
|
+
- index.html
|
66
|
+
- --line-numbers
|
67
|
+
- --main
|
68
|
+
- README
|
69
|
+
- --inline-source
|
70
|
+
- --exclude
|
71
|
+
- ^(examples|extras)/
|
72
|
+
require_paths:
|
73
|
+
- lib
|
74
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
75
|
+
none: false
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
hash: 3
|
80
|
+
segments:
|
81
|
+
- 0
|
82
|
+
version: "0"
|
83
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
84
|
+
none: false
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
hash: 3
|
89
|
+
segments:
|
90
|
+
- 0
|
91
|
+
version: "0"
|
92
|
+
requirements: []
|
93
|
+
|
94
|
+
rubyforge_project:
|
95
|
+
rubygems_version: 1.3.7
|
96
|
+
signing_key:
|
97
|
+
specification_version: 2
|
98
|
+
summary: extract text from MS Office document with Apache POI
|
99
|
+
test_files: []
|
100
|
+
|