iudex-barc 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc ADDED
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-04-04)
2
+ * Initial release.
data/Manifest.txt ADDED
@@ -0,0 +1,10 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ pom.xml
6
+ bin/iudex-barc
7
+ bin/iudex-http-record
8
+ lib/iudex-barc/base.rb
9
+ lib/iudex-barc.rb
10
+ lib/iudex-barc/iudex-barc-1.0.0.jar
data/README.rdoc ADDED
@@ -0,0 +1,25 @@
1
+ = iudex-barc
2
+
3
+ * http://github.com/dekellum/iudex
4
+
5
+ == Description
6
+
7
+ Iudex is a general purpose web crawler and feed processor in
8
+ ruby/java. The iudex-barc gem contains support for the BARC Basic
9
+ ARChive format.
10
+
11
+ == License
12
+
13
+ Copyright (c) 2008-2011 David Kellum
14
+
15
+ Licensed under the Apache License, Version 2.0 (the "License"); you
16
+ may not use this file except in compliance with the License. You
17
+ may obtain a copy of the License at:
18
+
19
+ http://www.apache.org/licenses/LICENSE-2.0
20
+
21
+ Unless required by applicable law or agreed to in writing, software
22
+ distributed under the License is distributed on an "AS IS" BASIS,
23
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24
+ implied. See the License for the specific language governing
25
+ permissions and limitations under the License.
data/Rakefile ADDED
@@ -0,0 +1,40 @@
1
+ # -*- ruby -*-
2
+
3
+ $LOAD_PATH << './lib'
4
+ require 'iudex-barc/base'
5
+
6
+ require 'rubygems'
7
+ gem 'rjack-tarpit', '~> 1.2'
8
+ require 'rjack-tarpit'
9
+
10
# Build/release definition for the iudex-barc gem, driven by RJack::TarPit.
t = RJack::TarPit.new( 'iudex-barc',
                       Iudex::BARC::VERSION,
                       :no_assembly, :java_platform )

t.specify do |h|
  h.developer( "David Kellum", "dek-oss@gravitext.com" )
  h.extra_deps += [ [ 'rjack-slf4j',    '~> 1.6.1' ],
                    [ 'gravitext-util', '~> 1.5.0' ],
                    [ 'iudex-http',     '~> 1.0.0' ] ]

  h.testlib = :minitest
  h.extra_dev_deps << [ 'minitest', '>= 1.7.1', '< 2.1' ]
end

# Declare base.rb as a prerequisite of the manifest.
file 'Manifest.txt' => "lib/#{t.name}/base.rb"

# The version as a regexp fragment. Escaped so that the dots in a
# version like "1.0.0" only match literal dots, not any character.
version_re = Regexp.escape( t.version.to_s )

# Release sanity checks built on TarPit#test_line_match.
# NOTE(review): presumably this fails the task when the first line
# matching the second argument does not also match the third -- confirm
# against the rjack-tarpit documentation.
task :check_pom_version do
  t.test_line_match( 'pom.xml', /<version>/, /#{version_re}/ )
end
task :check_history_version do
  t.test_line_match( 'History.rdoc', /^==/, / #{version_re} / )
end
task :check_history_date do
  t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
end

# Gate packaging and release tasks on the checks above.
task :gem  => [ :check_pom_version, :check_history_version ]
task :tag  => [ :check_pom_version, :check_history_version, :check_history_date ]
task :push => [ :check_history_date ]

t.define_tasks
data/bin/iudex-barc ADDED
@@ -0,0 +1,176 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+
23
+ require 'optparse'
24
+
25
# Namespace for the iudex-barc command line script, keeping the RJack
# include and the tool class out of the top-level object.
module IudexBinScript

  require 'rjack-logback'
  include RJack

  # Log to stderr so that log lines do not mix with dumped records on stdout.
  Logback.config_console( :level => Logback::INFO, :stderr => true )

  require 'iudex-barc'

  # Command line tool which dumps the records of one or more BARC files.
  class BARCTool
    include Iudex
    import 'com.gravitext.util.Streams'

    # Supported sub-commands; each names an instance method of this class.
    COMMANDS = [ :show ]

    # Record sections that can be shown, in display order.
    ALL_SECTIONS = [ :meta, :request, :response, :body ]

    # Title printed above each header section.
    HEADER_TITLES = { :meta     => "META",
                      :request  => "RQST",
                      :response => "RESP" }

    def initialize
      # nil means "no --offset given"; 0 is a legitimate record offset.
      @offset = nil
      @sections = ALL_SECTIONS
      @show_replaced = false
    end

    # Parse options from args, then dispatch to the selected command,
    # passing the remaining args (the BARC file names).
    def run( args = ARGV )
      command = parse_args( args )
      self.send( command, args )
    end

    # Parse and remove options and the command word from args (mutated
    # in place). Returns the command as a Symbol from COMMANDS.
    # Exits the process with status 0 for --version/--help, or with
    # status 1 after printing usage if the command is missing or unknown.
    def parse_args( args = ARGV )
      osecs = []

      parser = OptionParser.new do |opts|
        opts.banner = ( "Usage: iudex-barc [options] {show} BARCFile...\n" +
                        "Options:\n" )

        opts.on( "-v", "--version", "Display version and exit" ) do
          puts "iudex-barc: #{BARC::VERSION}"
          exit 0
        end

        opts.on( "-o", "--offset N", String,
                 "Offset into (first) BARCFile (one record)" ) do |offset|
          @offset = Integer( offset )
        end

        { :meta => 'm', :request => 'q', :response => 'r',
          :body => 'b' }.each do |sec,c|
          opts.on( '-' + c,
                   '--' + sec.to_s,
                   "Show #{sec} #{ 'headers' unless sec == :body }" ) do
            osecs << sec
          end
        end

        opts.on( "-x", "--show-replaced",
                 "Show replaced records as well." ) do
          @show_replaced = true
        end

        opts.on_tail( "-h", "--help", "Show help and exit" ) do
          # Explicitly requested help is not an error.
          opts.usage( 0 )
        end
      end

      # Print option and command help, then exit with the given status
      # (defaults to 1, for use on invalid invocations).
      def parser.usage( status = 1 )
        puts self
        puts
        puts( "Commands:")
        puts( " show: Dump BARC record details." )
        exit status
      end

      parser.parse!( args )

      # Any explicit section flags replace the show-everything default.
      @sections = osecs unless osecs.empty?

      bsec, @h_sections = @sections.partition { |s| s == :body }
      @body_section = ! bsec.empty?

      command = args.shift

      parser.usage unless command
      command = command.to_sym

      parser.usage unless COMMANDS.include?( command )
      command
    end

    # The show command. With --offset, display only the record read at
    # that offset of the first file. Otherwise display every record of
    # every file, skipping records of type 'R' unless --show-replaced.
    def show( barcs )
      if @offset
        bfile = barc_open( barcs.first )
        display( bfile.read( @offset ) )
      else
        barcs.each do |bname|
          bfile = barc_open( bname )
          rr = bfile.reader
          while( rec = rr.next )
            if @show_replaced || rec.type.chr != 'R'
              # Only label records with the file name if there are several.
              display( rec, ( bname if barcs.length > 1 ) )
            end
          end
        end
      end
    end

    # Print the record separator line (type, 'C' if compressed else 'P',
    # hex offset, optional meta label), then the selected header
    # sections and, if selected, the body.
    def display( rec, meta = nil )
      puts( "-BARC1 %c%s : 0x%x %s" %
            [ rec.type,
              rec.compressed? ? 'C':'P',
              rec.offset,
              meta ] )

      @h_sections.each do |ht|
        display_headers( ht, rec.send( "#{ht}_headers" ) )
      end

      dump_body( rec ) if @body_section
    end

    # Print a titled header section; prints nothing when headers is empty.
    def display_headers( htype, headers )
      unless headers.empty?
        puts "=#{ HEADER_TITLES[ htype ] }="
        headers.each { |h| puts h }
        puts
      end
    end

    # Copy the record body to standard output under a "=BODY=" title.
    # The first byte is read up front so that an empty body prints
    # nothing at all, not even the title.
    def dump_body( rec )
      body_in = rec.body_input_stream
      if body_in && ( c = body_in.read ) != -1
        out = Java::java.lang.System::out
        puts "=BODY="
        out.write( c )
        Streams::copy( body_in, out )
        puts
      end
    end

    # Open the named BARC file, raising if it does not exist.
    def barc_open( bname )
      #FIXME: Avoid opening new barc if not existing
      raise "BARC File '#{bname}' not found" unless File.exist?( bname )
      BARC::BARCFile.new( Java::java.io.File.new( bname ) )
    end
  end

  bt = BARCTool.new
  bt.run

end
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2008-2011 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You
8
+ # may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+ require 'rjack-logback'
23
+ RJack::Logback.config_console
24
+
25
+ #RJack::Logback[ "org.apache.commons.httpclient" ].level = RJack::Logback::INFO
26
+ #RJack::Logback[ "iudex" ].level = RJack::Logback::DEBUG
27
+
28
+ require 'iudex-barc'
29
+ require 'iudex-httpclient-3' #FIXME: Unspecified dependency
30
+
31
+ import 'iudex.httpclient3.HTTPClient3'
32
+ import 'iudex.barc.http.BARCResponseHandler'
33
+ import 'iudex.barc.BARCFile'
34
+
35
# Issue a single HTTP request for a fixed URL, handing the response to a
# BARCResponseHandler constructed over ./record.barc.
# NOTE(review): presumably the handler appends the response as a BARC
# record -- confirm against iudex.barc.http.BARCResponseHandler.
hmanager = RJack::HTTPClient3::ManagerFacade.new
hmanager.start

begin
  hclient = HTTPClient3.new( hmanager.client )

  barc_file = BARCFile.new( java.io.File.new( './record.barc' ) ) #FIXME: param
  barc_file.truncate #FIXME: Optional

  handler = BARCResponseHandler.new( barc_file )
  handler.do_compress = false #FIXME: Option

  hsession = hclient.createSession
  hsession.url = 'http://gravitext.com/blog' #FIXME: param

  hclient.request( hsession, handler )
ensure
  # Shut the started manager down even if any step above raises.
  hmanager.shutdown
end
@@ -0,0 +1,23 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
module Iudex
  # Version and install location of the iudex-barc gem.
  module BARC
    # Gem version. Frozen so the shared constant cannot be mutated in place.
    VERSION = '1.0.0'.freeze

    # Directory containing this file.
    LIB_DIR = File.dirname( __FILE__ ).freeze # :nodoc:
  end
end
Binary file
data/lib/iudex-barc.rb ADDED
@@ -0,0 +1,33 @@
1
+ #--
2
+ # Copyright (c) 2008-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'rjack-slf4j'
18
+ require 'gravitext-util'
19
+
20
+ require 'iudex-http'
21
+
22
+ require 'iudex-barc/base'
23
+
24
+ require 'java'
25
+
26
module Iudex
  # Requires the versioned iudex-barc jar from LIB_DIR and imports the
  # BARCDirectory and BARCFile Java classes into this module.
  module BARC
    jar_name = "iudex-barc-#{VERSION}.jar"
    require File.join( LIB_DIR, jar_name )

    import 'iudex.barc.BARCDirectory'
    import 'iudex.barc.BARCFile'
  end
end
data/pom.xml ADDED
@@ -0,0 +1,62 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3
+
4
+ <modelVersion>4.0.0</modelVersion>
5
+ <groupId>iudex</groupId>
6
+ <artifactId>iudex-barc</artifactId>
7
+ <packaging>jar</packaging>
8
+ <version>1.0.0</version>
9
+ <name>Iudex Basic ARChive Format</name>
10
+
11
+ <parent>
12
+ <groupId>iudex</groupId>
13
+ <artifactId>iudex-parent</artifactId>
14
+ <version>1.0</version>
15
+ <relativePath>..</relativePath>
16
+ </parent>
17
+
18
+ <dependencies>
19
+
20
+ <dependency>
21
+ <groupId>org.slf4j</groupId>
22
+ <artifactId>slf4j-api</artifactId>
23
+ </dependency>
24
+
25
+ <dependency>
26
+ <groupId>com.gravitext</groupId>
27
+ <artifactId>gravitext-util</artifactId>
28
+ </dependency>
29
+
30
+ <dependency>
31
+ <groupId>iudex</groupId>
32
+ <artifactId>iudex-http</artifactId>
33
+ <version>[1.0,1.1)</version>
34
+ </dependency>
35
+
36
+ <dependency>
37
+ <groupId>junit</groupId>
38
+ <artifactId>junit</artifactId>
39
+ </dependency>
40
+
41
+ <dependency>
42
+ <groupId>ch.qos.logback</groupId>
43
+ <artifactId>logback-classic</artifactId>
44
+ <scope>test</scope>
45
+ </dependency>
46
+
47
+ </dependencies>
48
+
49
+ <build>
50
+ <plugins>
51
+ <plugin>
52
+ <!-- Parent settings -->
53
+ <artifactId>maven-compiler-plugin</artifactId>
54
+ </plugin>
55
+ <plugin>
56
+ <!-- Parent settings -->
57
+ <artifactId>maven-source-plugin</artifactId>
58
+ </plugin>
59
+ </plugins>
60
+ </build>
61
+
62
+ </project>
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iudex-barc
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.0.0
6
+ platform: java
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-04-04 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: rjack-slf4j
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 1.6.1
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: gravitext-util
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ~>
34
+ - !ruby/object:Gem::Version
35
+ version: 1.5.0
36
+ type: :runtime
37
+ version_requirements: *id002
38
+ - !ruby/object:Gem::Dependency
39
+ name: iudex-http
40
+ prerelease: false
41
+ requirement: &id003 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ~>
45
+ - !ruby/object:Gem::Version
46
+ version: 1.0.0
47
+ type: :runtime
48
+ version_requirements: *id003
49
+ - !ruby/object:Gem::Dependency
50
+ name: minitest
51
+ prerelease: false
52
+ requirement: &id004 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: 1.7.1
58
+ - - <
59
+ - !ruby/object:Gem::Version
60
+ version: "2.1"
61
+ type: :development
62
+ version_requirements: *id004
63
+ - !ruby/object:Gem::Dependency
64
+ name: rjack-tarpit
65
+ prerelease: false
66
+ requirement: &id005 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ~>
70
+ - !ruby/object:Gem::Version
71
+ version: 1.3.0
72
+ type: :development
73
+ version_requirements: *id005
74
+ description: |-
75
+ Iudex is a general purpose web crawler and feed processor in
76
+ ruby/java. The iudex-barc gem contains support for the BARC Basic
77
+ ARChive format.
78
+ email:
79
+ - dek-oss@gravitext.com
80
+ executables:
81
+ - iudex-barc
82
+ - iudex-http-record
83
+ extensions: []
84
+
85
+ extra_rdoc_files:
86
+ - Manifest.txt
87
+ - History.rdoc
88
+ - README.rdoc
89
+ files:
90
+ - History.rdoc
91
+ - Manifest.txt
92
+ - README.rdoc
93
+ - Rakefile
94
+ - pom.xml
95
+ - bin/iudex-barc
96
+ - bin/iudex-http-record
97
+ - lib/iudex-barc/base.rb
98
+ - lib/iudex-barc.rb
99
+ - lib/iudex-barc/iudex-barc-1.0.0.jar
100
+ has_rdoc: true
101
+ homepage: http://github.com/dekellum/iudex
102
+ licenses: []
103
+
104
+ post_install_message:
105
+ rdoc_options:
106
+ - --main
107
+ - README.rdoc
108
+ require_paths:
109
+ - lib
110
+ required_ruby_version: !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: "0"
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ none: false
118
+ requirements:
119
+ - - ">="
120
+ - !ruby/object:Gem::Version
121
+ version: "0"
122
+ requirements: []
123
+
124
+ rubyforge_project: iudex-barc
125
+ rubygems_version: 1.5.1
126
+ signing_key:
127
+ specification_version: 3
128
+ summary: Iudex is a general purpose web crawler and feed processor in ruby/java
129
+ test_files: []
130
+