iudex-barc 1.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/History.rdoc ADDED
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-04-04)
2
+ * Initial release.
data/Manifest.txt ADDED
@@ -0,0 +1,10 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ pom.xml
6
+ bin/iudex-barc
7
+ bin/iudex-http-record
8
+ lib/iudex-barc/base.rb
9
+ lib/iudex-barc.rb
10
+ lib/iudex-barc/iudex-barc-1.0.0.jar
data/README.rdoc ADDED
@@ -0,0 +1,25 @@
1
+ = iudex-barc
2
+
3
+ * http://github.com/dekellum/iudex
4
+
5
+ == Description
6
+
7
+ Iudex is a general purpose web crawler and feed processor in
8
+ ruby/java. The iudex-barc gem contains support for the BARC Basic
9
+ ARChive format.
10
+
11
+ == License
12
+
13
+ Copyright (c) 2008-2011 David Kellum
14
+
15
+ Licensed under the Apache License, Version 2.0 (the "License"); you
16
+ may not use this file except in compliance with the License. You
17
+ may obtain a copy of the License at:
18
+
19
+ http://www.apache.org/licenses/LICENSE-2.0
20
+
21
+ Unless required by applicable law or agreed to in writing, software
22
+ distributed under the License is distributed on an "AS IS" BASIS,
23
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24
+ implied. See the License for the specific language governing
25
+ permissions and limitations under the License.
data/Rakefile ADDED
@@ -0,0 +1,40 @@
1
# -*- ruby -*-
#
# Build definition for the iudex-barc gem, using rjack-tarpit.

$LOAD_PATH << './lib'
require 'iudex-barc/base'

require 'rubygems'
gem 'rjack-tarpit', '~> 1.2'
require 'rjack-tarpit'

t = RJack::TarPit.new( 'iudex-barc',
                       Iudex::BARC::VERSION,
                       :no_assembly, :java_platform )

t.specify do |h|
  h.developer( "David Kellum", "dek-oss@gravitext.com" )
  h.extra_deps += [ [ 'rjack-slf4j',    '~> 1.6.1' ],
                    [ 'gravitext-util', '~> 1.5.0' ],
                    [ 'iudex-http',     '~> 1.0.0' ] ]

  h.testlib = :minitest
  h.extra_dev_deps << [ 'minitest', '>= 1.7.1', '< 2.1' ]
end

file 'Manifest.txt' => "lib/#{t.name}/base.rb"

# The version must be matched literally. Interpolated unescaped, each
# '.' of "1.0.0" would match any character (e.g. "1x0y0" would pass).
version_re = Regexp.escape( t.version.to_s )

# Release sanity checks: pom.xml and History.rdoc are tested against
# the gem version, and the History.rdoc heading for a "(date)" suffix.
task :check_pom_version do
  t.test_line_match( 'pom.xml', /<version>/, /#{version_re}/ )
end
task :check_history_version do
  t.test_line_match( 'History.rdoc', /^==/, / #{version_re} / )
end
task :check_history_date do
  t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
end

task :gem  => [ :check_pom_version, :check_history_version ]
task :tag  => [ :check_pom_version, :check_history_version, :check_history_date ]
task :push => [ :check_history_date ]

t.define_tasks
data/bin/iudex-barc ADDED
@@ -0,0 +1,176 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+
23
+ require 'optparse'
24
+
25
module IudexBinScript

  require 'rjack-logback'
  include RJack

  # Console logging at INFO level, written to stderr.
  Logback.config_console( :level => Logback::INFO, :stderr => true )

  require 'iudex-barc'

  # Command line tool for inspecting the records of BARC files.
  class BARCTool
    include Iudex
    import 'com.gravitext.util.Streams'

    # Supported commands. Each names an instance method that receives
    # the arguments remaining after option parsing.
    COMMANDS = [ :show ].freeze

    # Sections shown when no section option is given.
    ALL_SECTIONS = [ :meta, :request, :response, :body ].freeze

    # Title printed above each non-empty header section.
    HEADER_TITLES = { :meta     => "META",
                      :request  => "RQST",
                      :response => "RESP" }.freeze

    def initialize
      @offset = 0                 # 0 means: no single record requested
      @sections = ALL_SECTIONS
      @show_replaced = false
    end

    # Parse options out of args, then dispatch to the command method
    # with the arguments that remain (the BARC file names).
    def run( args = ARGV )
      command = parse_args( args )
      send( command, args )
    end

    # Parse and remove options and the command name from args,
    # recording the selected offset and sections in instance state.
    #
    # Returns the command as a Symbol (a member of COMMANDS). Prints
    # usage and exits with status 1 when the command is missing or
    # unknown, or when no BARC file is named. --version and --help
    # print and exit with status 0.
    def parse_args( args = ARGV )
      osecs = []

      parser = OptionParser.new do |opts|
        opts.banner = ( "Usage: iudex-barc [options] {show} BARCFile...\n" +
                        "Options:\n" )

        opts.on( "-v", "--version", "Display version and exit" ) do
          puts "iudex-barc: #{BARC::VERSION}"
          exit 0 # A successfully answered request is not a failure
        end

        # Taken as a String and converted via Integer() so that the
        # 0x-prefixed hex offsets printed by display are accepted.
        opts.on( "-o", "--offset N", String,
                 "Offset into (first) BARCFile (one record)" ) do |offset|
          @offset = Integer( offset )
        end

        { :meta => 'm', :request => 'q', :response => 'r',
          :body => 'b' }.each do |sec,c|
          opts.on( '-' + c,
                   '--' + sec.to_s,
                   "Show #{sec} #{ 'headers' unless sec == :body }" ) do
            osecs << sec
          end
        end

        opts.on( "-x", "--show-replaced",
                 "Show replaced records as well." ) do
          @show_replaced = true
        end

        opts.on_tail( "-h", "--help", "Show help and exit" ) do
          opts.usage( 0 )
        end
      end

      # Print the option summary and command list, then exit. The
      # status defaults to 1 since most callers report a usage error.
      def parser.usage( status = 1 )
        puts self
        puts
        puts( "Commands:")
        puts( " show: Dump BARC record details." )
        exit status
      end

      parser.parse!( args )

      @sections = osecs unless osecs.empty?

      # Header sections are printed generically; the body is dumped
      # separately, so split it out as a flag.
      bsec, @h_sections = @sections.partition { |s| s == :body }
      @body_section = ! bsec.empty?

      command = args.shift

      parser.usage unless command
      command = command.to_sym

      parser.usage unless COMMANDS.include?( command )

      # Without this, a missing file name surfaced as a TypeError from
      # File.exist?( nil ) with --offset, or as a silent no-op without.
      parser.usage if args.empty?

      command
    end

    # Show records from the named BARC files. With a non-zero offset,
    # only the single record read at that offset of the first file is
    # shown. Otherwise all records of all files are shown, skipping
    # records of type 'R' unless --show-replaced was given.
    def show( barcs )
      if @offset != 0
        bfile = barc_open( barcs.first )
        display( bfile.read( @offset ) )
      else
        barcs.each do |bname|
          bfile = barc_open( bname )
          rr = bfile.reader
          while( rec = rr.next )
            if @show_replaced || rec.type.chr != 'R'
              # The file name is only shown to tell multiple files apart
              display( rec, ( bname if barcs.length > 1 ) )
            end
          end
        end
      end
    end

    # Print the summary line for rec (type, 'C' if compressed else
    # 'P', hex offset, optional meta text), followed by the selected
    # header sections and, if selected, the body.
    def display( rec, meta = nil )
      puts( "-BARC1 %c%s : 0x%x %s" %
            [ rec.type,
              rec.compressed? ? 'C':'P',
              rec.offset,
              meta ] )

      @h_sections.each do |ht|
        display_headers( ht, rec.send( "#{ht}_headers" ) )
      end

      dump_body( rec ) if @body_section
    end

    # Print a titled block of headers, one per line, followed by a
    # blank line. Prints nothing if headers is empty.
    def display_headers( htype, headers )
      unless headers.empty?
        puts "=#{ HEADER_TITLES[ htype ] }="
        headers.each { |h| puts h }
        puts
      end
    end

    # Copy the record body to Java's System.out under a =BODY= title.
    # One byte is read first so that nothing at all is printed for a
    # missing or empty body (read returning -1).
    # NOTE(review): mixes Ruby puts with Java System.out; output order
    # presumably relies on how both streams are flushed -- confirm.
    def dump_body( rec )
      body_in = rec.body_input_stream
      if body_in && ( c = body_in.read ) != -1
        out = Java::java.lang.System::out
        puts "=BODY="
        out.write( c )
        Streams::copy( body_in, out )
        puts
      end
    end

    # Open the named, existing BARC file. Raises RuntimeError if no
    # such file exists.
    def barc_open( bname )
      #FIXME: Avoid opening new barc if not existing
      raise "BARC File '#{bname}' not found" unless File.exist?( bname )
      BARC::BARCFile.new( Java::java.io.File.new( bname ) )
    end
  end

  bt = BARCTool.new
  bt.run

end
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+ require 'rjack-logback'
23
+ RJack::Logback.config_console
24
+
25
+ #RJack::Logback[ "org.apache.commons.httpclient" ].level = RJack::Logback::INFO
26
+ #RJack::Logback[ "iudex" ].level = RJack::Logback::DEBUG
27
+
28
+ require 'iudex-barc'
29
+ require 'iudex-httpclient-3' #FIXME: Unspecified dependency
30
+
31
+ import 'iudex.httpclient3.HTTPClient3'
32
+ import 'iudex.barc.http.BARCResponseHandler'
33
+ import 'iudex.barc.BARCFile'
34
+
35
# Usage: iudex-http-record [URL [BARCFile]]
#
# Requests URL with the BARCResponseHandler writing to BARCFile. Both
# arguments are optional and default to the formerly hard-coded values.
url       = ARGV[0] || 'http://gravitext.com/blog'
barc_path = ARGV[1] || './record.barc'

hmanager = RJack::HTTPClient3::ManagerFacade.new
hmanager.start

begin
  hclient = HTTPClient3.new( hmanager.client )

  barc_file = BARCFile.new( java.io.File.new( barc_path ) )
  barc_file.truncate #FIXME: Optional

  handler = BARCResponseHandler.new( barc_file )
  handler.do_compress = false #FIXME: Option

  hsession = hclient.createSession
  hsession.url = url

  hclient.request( hsession, handler )
ensure
  # Shut the started manager down even when setup or the request raises.
  hmanager.shutdown
end
@@ -0,0 +1,23 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
module Iudex
  module BARC
    # Version of this gem. Passed to the gem build by the Rakefile and
    # interpolated into the bundled jar file name by lib/iudex-barc.rb.
    # Frozen so that the shared constant cannot be mutated in place.
    VERSION = '1.0.0'.freeze

    # Directory containing this file, from which the jar is loaded.
    LIB_DIR = File.dirname( __FILE__ ).freeze # :nodoc:
  end
end
Binary file
data/lib/iudex-barc.rb ADDED
@@ -0,0 +1,33 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'rjack-slf4j'
18
+ require 'gravitext-util'
19
+
20
+ require 'iudex-http'
21
+
22
+ require 'iudex-barc/base'
23
+
24
+ require 'java'
25
+
26
module Iudex
  module BARC
    # Load the bundled jar, located in LIB_DIR and named by gem VERSION.
    jar_path = "#{LIB_DIR}/iudex-barc-#{VERSION}.jar"
    require jar_path

    # Import the Java classes into this module, so that they can be
    # referenced as BARC::BARCDirectory and BARC::BARCFile.
    [ 'iudex.barc.BARCDirectory',
      'iudex.barc.BARCFile' ].each { |java_class| import java_class }
  end
end
data/pom.xml ADDED
@@ -0,0 +1,62 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3
+
4
+ <modelVersion>4.0.0</modelVersion>
5
+ <groupId>iudex</groupId>
6
+ <artifactId>iudex-barc</artifactId>
7
+ <packaging>jar</packaging>
8
+ <version>1.0.0</version>
9
+ <name>Iudex Basic ARChive Format</name>
10
+
11
+ <parent>
12
+ <groupId>iudex</groupId>
13
+ <artifactId>iudex-parent</artifactId>
14
+ <version>1.0</version>
15
+ <relativePath>..</relativePath>
16
+ </parent>
17
+
18
+ <dependencies>
19
+
20
+ <dependency>
21
+ <groupId>org.slf4j</groupId>
22
+ <artifactId>slf4j-api</artifactId>
23
+ </dependency>
24
+
25
+ <dependency>
26
+ <groupId>com.gravitext</groupId>
27
+ <artifactId>gravitext-util</artifactId>
28
+ </dependency>
29
+
30
+ <dependency>
31
+ <groupId>iudex</groupId>
32
+ <artifactId>iudex-http</artifactId>
33
+ <version>[1.0,1.1)</version>
34
+ </dependency>
35
+
36
+ <dependency>
37
+ <groupId>junit</groupId>
38
+ <artifactId>junit</artifactId>
39
+ </dependency>
40
+
41
+ <dependency>
42
+ <groupId>ch.qos.logback</groupId>
43
+ <artifactId>logback-classic</artifactId>
44
+ <scope>test</scope>
45
+ </dependency>
46
+
47
+ </dependencies>
48
+
49
+ <build>
50
+ <plugins>
51
+ <plugin>
52
+ <!-- Parent settings -->
53
+ <artifactId>maven-compiler-plugin</artifactId>
54
+ </plugin>
55
+ <plugin>
56
+ <!-- Parent settings -->
57
+ <artifactId>maven-source-plugin</artifactId>
58
+ </plugin>
59
+ </plugins>
60
+ </build>
61
+
62
+ </project>
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iudex-barc
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.0.0
6
+ platform: java
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-04-04 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: rjack-slf4j
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 1.6.1
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: gravitext-util
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ~>
34
+ - !ruby/object:Gem::Version
35
+ version: 1.5.0
36
+ type: :runtime
37
+ version_requirements: *id002
38
+ - !ruby/object:Gem::Dependency
39
+ name: iudex-http
40
+ prerelease: false
41
+ requirement: &id003 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ~>
45
+ - !ruby/object:Gem::Version
46
+ version: 1.0.0
47
+ type: :runtime
48
+ version_requirements: *id003
49
+ - !ruby/object:Gem::Dependency
50
+ name: minitest
51
+ prerelease: false
52
+ requirement: &id004 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: 1.7.1
58
+ - - <
59
+ - !ruby/object:Gem::Version
60
+ version: "2.1"
61
+ type: :development
62
+ version_requirements: *id004
63
+ - !ruby/object:Gem::Dependency
64
+ name: rjack-tarpit
65
+ prerelease: false
66
+ requirement: &id005 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ~>
70
+ - !ruby/object:Gem::Version
71
+ version: 1.3.0
72
+ type: :development
73
+ version_requirements: *id005
74
+ description: |-
75
+ Iudex is a general purpose web crawler and feed processor in
76
+ ruby/java. The iudex-barc gem contains support for the BARC Basic
77
+ ARChive format.
78
+ email:
79
+ - dek-oss@gravitext.com
80
+ executables:
81
+ - iudex-barc
82
+ - iudex-http-record
83
+ extensions: []
84
+
85
+ extra_rdoc_files:
86
+ - Manifest.txt
87
+ - History.rdoc
88
+ - README.rdoc
89
+ files:
90
+ - History.rdoc
91
+ - Manifest.txt
92
+ - README.rdoc
93
+ - Rakefile
94
+ - pom.xml
95
+ - bin/iudex-barc
96
+ - bin/iudex-http-record
97
+ - lib/iudex-barc/base.rb
98
+ - lib/iudex-barc.rb
99
+ - lib/iudex-barc/iudex-barc-1.0.0.jar
100
+ has_rdoc: true
101
+ homepage: http://github.com/dekellum/iudex
102
+ licenses: []
103
+
104
+ post_install_message:
105
+ rdoc_options:
106
+ - --main
107
+ - README.rdoc
108
+ require_paths:
109
+ - lib
110
+ required_ruby_version: !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: "0"
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ none: false
118
+ requirements:
119
+ - - ">="
120
+ - !ruby/object:Gem::Version
121
+ version: "0"
122
+ requirements: []
123
+
124
+ rubyforge_project: iudex-barc
125
+ rubygems_version: 1.5.1
126
+ signing_key:
127
+ specification_version: 3
128
+ summary: Iudex is a general purpose web crawler and feed processor in ruby/java
129
+ test_files: []
130
+