dmoz_sax_doc 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +3 -0
- data/Gemfile.lock +28 -0
- data/README.md +46 -0
- data/Rakefile +3 -0
- data/dmoz_sax_doc.gemspec +24 -0
- data/lib/dmoz_sax/alias.rb +9 -0
- data/lib/dmoz_sax/content_document.rb +65 -0
- data/lib/dmoz_sax/external_page.rb +11 -0
- data/lib/dmoz_sax/name_parser.rb +8 -0
- data/lib/dmoz_sax/path.rb +31 -0
- data/lib/dmoz_sax/structure_document.rb +65 -0
- data/lib/dmoz_sax/time_parser.rb +8 -0
- data/lib/dmoz_sax/topic.rb +11 -0
- data/lib/dmoz_sax/version.rb +3 -0
- data/lib/dmoz_sax.rb +13 -0
- data/spec/alias_spec.rb +6 -0
- data/spec/content_document_spec.rb +21 -0
- data/spec/external_page_spec.rb +5 -0
- data/spec/path_spec.rb +40 -0
- data/spec/samples/content_sample.rdf.u8 +41 -0
- data/spec/samples/structure_sample.rdf.u8 +48 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/structure_document_spec.rb +31 -0
- data/spec/topic_spec.rb +5 -0
- data/spec/version_spec.rb +7 -0
- metadata +129 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
dmoz_sax_doc (0.0.2)
|
5
|
+
json (~> 1.7)
|
6
|
+
nokogiri (~> 1.5)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
diff-lcs (1.2.1)
|
12
|
+
json (1.7.7)
|
13
|
+
nokogiri (1.5.6)
|
14
|
+
rspec (2.13.0)
|
15
|
+
rspec-core (~> 2.13.0)
|
16
|
+
rspec-expectations (~> 2.13.0)
|
17
|
+
rspec-mocks (~> 2.13.0)
|
18
|
+
rspec-core (2.13.0)
|
19
|
+
rspec-expectations (2.13.0)
|
20
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
21
|
+
rspec-mocks (2.13.0)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
ruby
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
dmoz_sax_doc!
|
28
|
+
rspec (~> 2.12)
|
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# DMOZ SAX Documents
|
2
|
+
|
3
|
+
This gem provides `StructureDocument` and `ContentDocument`, subclasses of `Nokogiri::XML::SAX::Document` intended for parsing the content.rdf.u8 and structure.rdf.u8 files available from dmoz.org. This gem is not affiliated with the DMOZ project.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'dmoz_sax_doc'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install dmoz_sax_doc
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
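Assign callbacks to a document, then hand the document to a Nokogiri SAX parser:

    document = DmozSax::StructureDocument.new
    document.on_topic = lambda { |topic| puts topic.path.to_s }
    document.on_alias = lambda { |a| puts a.title }

    parser = Nokogiri::XML::SAX::Parser.new(document)
    File.open('structure.rdf.u8') { |io| parser.parse(io) }

`DmozSax::ContentDocument` is used the same way, with the `on_topic` and `on_external_page` callbacks.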
|
22
|
+
|
23
|
+
## Open Directory License
|
24
|
+
|
25
|
+
This gem uses snippets of the DMOZ RDF content available from dmoz.org as test files. The DMOZ Open Directory Project is [licensed](http://www.dmoz.org/license.html) under [Creative Commons Attribution 3.0 Unported](http://creativecommons.org/licenses/by/3.0/).
|
26
|
+
|
27
|
+
<table>
|
28
|
+
<tr align="center">
|
29
|
+
<td>Help build the largest human-edited directory on the web.</td>
|
30
|
+
</tr>
|
31
|
+
<tr align="center">
|
32
|
+
<td>
|
33
|
+
<a href="http://www.dmoz.org/cgi-bin/add.cgi?where=Top">Submit a Site</a> -
|
34
|
+
<a href="http://www.dmoz.org/about.html"><b>Open Directory Project</b></a> -
|
35
|
+
<a href="http://www.dmoz.org/cgi-bin/apply.cgi?where=Top">Become an Editor</a>
|
36
|
+
</td></tr>
|
37
|
+
</table>
|
38
|
+
|
39
|
+
|
40
|
+
## Contributing
|
41
|
+
|
42
|
+
1. Fork it
|
43
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
44
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
45
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
46
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'dmoz_sax/version'
|
5
|
+
|
6
|
+
# Gem specification for dmoz_sax_doc.
#
# The version is read from DmozSax::VERSION, and the packaged file list is
# taken from the files tracked by git, so the gem must be built from a git
# checkout.
Gem::Specification.new do |gem|
  gem.name          = "dmoz_sax_doc"
  gem.version       = DmozSax::VERSION
  gem.authors       = ["Galen Palmer"]
  gem.email         = ["palmergs@gmail.com"]
  gem.description   = %q{Use a SAX parser to visit either the structure.u8 or content.u8 DMOZ files.}
  # The summary previously read "structure of content files", which
  # contradicted the description above.
  gem.summary       = %q{SAX visitor for DMOZ structure or content files.}
  gem.homepage      = "https://github.com/palmergs/dmoz_sax_doc"

  # Every tracked file is shipped; executables and test files are derived
  # from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies.
  gem.add_dependency 'nokogiri', '~> 1.5'
  gem.add_dependency 'json', '~> 1.7'

  # Needed only to run the specs.
  gem.add_development_dependency 'rspec', '~> 2.12'
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module DmozSax
  # SAX handler for the DMOZ content dump (content.rdf.u8).
  #
  # A DmozSax::Topic is built for every <Topic> element and a
  # DmozSax::ExternalPage for every <ExternalPage> element. When an element
  # closes, the finished object is passed to the matching callback
  # (+on_topic+ / +on_external_page+) if one has been assigned. A callback is
  # any object that responds to +call+, e.g. a lambda.
  class ContentDocument < Nokogiri::XML::SAX::Document

    attr_accessor :on_topic, :on_external_page
    attr_accessor :name_parser, :time_parser

    def initialize
      super

      @name_parser = NameParser.new
      @time_parser = TimeParser.new
    end

    # Appends a chunk of character data to the buffer of the element that is
    # currently being read.
    def characters(string)
      @buffer ||= ""
      @buffer << string
    end

    # Starts a new element: clears the text buffer and, for the elements this
    # handler cares about, creates the object that will be filled in.
    #
    # name       - element name as reported by the parser.
    # attributes - array of [name, value] pairs; the value of the FIRST
    #              attribute is used as the resource identifier.
    def start_element(name, attributes = [])
      @buffer = ""
      @name = name

      case name
      when 'Topic'
        # Reset the category id so a <Topic> without <catid> cannot inherit
        # the id read for the previous topic.
        @cid = nil
        @topic = DmozSax::Topic.new attributes[0][1]
      when /^link/
        @topic.links << attributes[0][1]
      when 'ExternalPage'
        @priority = 0
        @time = nil
        # Reset the remaining per-page fields too, so a page that lacks a
        # title, description or topic does not reuse the previous page's.
        @title = @description = @path = nil
        @external = DmozSax::ExternalPage.new attributes[0][1]
      end
    end

    # Finishes an element: converts the buffered text into the matching
    # field, or, for <Topic> / <ExternalPage>, copies the collected fields
    # onto the object and hands it to the callback.
    def end_element(name)
      case name
      when 'catid'
        @cid = @buffer.to_i
      when 'd:Description'
        @description = @buffer.strip
      when 'd:Title'
        @title = @buffer.strip.gsub('_', ' ')
      when 'Topic'
        @topic.cid = @cid
        @on_topic.call(@topic) if @on_topic
      when 'topic'
        @path = DmozSax::Path.new @buffer
      when 'mediadate'
        @time = @time_parser.time_from @buffer
      when 'priority'
        @priority = @buffer.to_i
      when 'ExternalPage'
        @external.priority = @priority
        @external.title = @title
        @external.description = @description
        @external.path = @path
        @external.time = @time
        @on_external_page.call(@external) if @on_external_page
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'delegate'
|
2
|
+
|
3
|
+
module DmozSax
  # Immutable list of category names parsed from a DMOZ resource string.
  #
  # A resource string looks like "Top/Arts/Animation", optionally preceded by
  # a name and a colon ("Typography:Top/Arts/Graphic_Design/Typography").
  # Underscores are turned into spaces, a leading "Top" segment is dropped and
  # single capital-letter segments ("A".."Z") are removed.
  #
  # The object delegates to a frozen Array, so read-only Array methods work
  # directly on it while mutating methods raise.
  class Path < DelegateClass(Array)

    attr_reader :name, :level

    # str   - resource string; nil is treated like an empty string.
    # level - anything responding to +to_i+ (nil becomes 0).
    def initialize(str, level = 0)
      resource = str.to_s.gsub('_', ' ').split(':')

      # A name is only recognised when there is exactly one colon.
      @name = resource.first if resource.length == 2

      unless resource.empty?
        # \A and \z anchor the whole segment; ^ and $ would also match a
        # single letter on one line of a multi-line segment.
        @path = resource.last.split('/').reject { |segment| segment =~ /\A[A-Z]\z/ }
        @path.shift if 'Top' == @path.first
      else
        @path = []
      end
      @level = level.to_i
      super(@path.freeze)
    end

    # Returns a mutable copy of the segments.
    def to_a
      @path.dup
    end

    # Returns the segments joined with "/".
    def to_s
      @path.join('/')
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module DmozSax
  # SAX handler for the DMOZ structure dump (structure.rdf.u8).
  #
  # A DmozSax::Topic is built for every <Topic> element and a DmozSax::Alias
  # for every <Alias> element. When an element closes, the finished object is
  # passed to the matching callback (+on_topic+ / +on_alias+) if one has been
  # assigned. A callback is any object that responds to +call+.
  class StructureDocument < Nokogiri::XML::SAX::Document

    attr_accessor :on_topic, :on_alias
    attr_accessor :name_parser, :time_parser

    def initialize
      super

      @name_parser = NameParser.new
      @time_parser = TimeParser.new
    end

    # Appends a chunk of character data to the buffer of the element that is
    # currently being read.
    def characters(string)
      @buffer ||= ""
      @buffer << string
    end

    # Starts a new element: clears the text buffer and either creates the
    # object to fill in or records a relation on the current topic.
    #
    # name       - element name as reported by the parser.
    # attributes - array of [name, value] pairs; the value of the FIRST
    #              attribute is used as the resource identifier.
    def start_element(name, attributes = [])
      @buffer = ""

      case name
      when 'Topic'
        # Reset every per-topic field, including the timestamp, so a topic
        # that lacks one of them does not inherit the previous topic's value.
        @cid, @description, @title, @time = nil, nil, nil, nil
        @topic = DmozSax::Topic.new attributes[0][1]
      when 'Alias'
        @alias = DmozSax::Alias.new attributes[0][1]
      when 'Target'
        # NOTE(review): stored here but not read anywhere in this class.
        @path = attributes[0][1]
      when 'altlang'
        @topic.alt_langs << DmozSax::Path.new(attributes[0][1])
      when 'related'
        @topic.related << DmozSax::Path.new(attributes[0][1])
      when /^narrow/
        # The element name itself (e.g. "narrow1") is passed to the name
        # parser to obtain the level stored on the path.
        @topic.narrows << DmozSax::Path.new(attributes[0][1], @name_parser.level_from(name))
      when /^symbolic/
        @topic.symbolics << DmozSax::Path.new(attributes[0][1], @name_parser.level_from(name))
      end
    end

    # Finishes an element: converts the buffered text into the matching
    # field, or, for <Alias> / <Topic>, hands the object to the callback
    # (topics first receive the collected fields).
    def end_element(name)
      case name
      when 'catid'
        @cid = @buffer.to_i
      when 'd:Description'
        @description = @buffer.strip
      when 'd:Title'
        @title = @buffer.strip.gsub('_', ' ')
      when 'lastUpdate'
        @time = @time_parser.time_from @buffer
      when 'Alias'
        @on_alias.call(@alias) if @on_alias
      when 'Topic'
        @topic.cid = @cid
        @topic.title = @title
        @topic.description = @description
        @topic.time = @time
        @on_topic.call(@topic) if @on_topic
      end
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module DmozSax
  # A DMOZ category: its path, its scalar fields and the lists of related
  # paths and links, all exposed as read/write attributes.
  class Topic
    attr_accessor :path, :cid, :title, :description, :time
    attr_accessor :narrows, :symbolics, :related, :alt_langs, :links

    # path_str - resource string handed to DmozSax::Path.
    #
    # Every list attribute starts out as its own empty array and +cid+
    # starts out as nil.
    def initialize(path_str)
      @path = DmozSax::Path.new(path_str)
      @narrows = []
      @symbolics = []
      @related = []
      @alt_langs = []
      @links = []
      @cid = nil
    end
  end
end
|
data/lib/dmoz_sax.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require "dmoz_sax/version"
|
2
|
+
require "dmoz_sax/name_parser"
|
3
|
+
require "dmoz_sax/time_parser"
|
4
|
+
require "dmoz_sax/path"
|
5
|
+
require "dmoz_sax/alias"
|
6
|
+
require "dmoz_sax/topic"
|
7
|
+
require "dmoz_sax/external_page"
|
8
|
+
require "dmoz_sax/structure_document"
|
9
|
+
require "dmoz_sax/content_document"
|
10
|
+
|
11
|
+
module DmozSax
|
12
|
+
|
13
|
+
end
|
data/spec/alias_spec.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe DmozSax::ContentDocument do
  #it 'can parse a real content.rdf.u8 document' do
  #  parser = Nokogiri::XML::SAX::Parser.new(DmozSax::ContentDocument.new)
  #  parser.parse(File.open('/opt/data/DMOZ/content.rdf.u8'))
  #end

  it 'can parse a sample content.rdf.u8 document' do

    topics = []
    pages = []

    document = DmozSax::ContentDocument.new
    document.on_topic = lambda {|t| topics << t }
    document.on_external_page = lambda {|t| pages << t }

    parser = Nokogiri::XML::SAX::Parser.new(document)
    # Block form closes the sample file once parsing has finished.
    File.open('spec/samples/content_sample.rdf.u8') {|io| parser.parse(io) }

    # The sample holds three <Topic> elements and four <ExternalPage>
    # elements; the third topic carries seven link/link1 children.
    topics.count.should == 3
    topics[2].cid.should == 423945
    topics[2].links.count.should == 7

    pages.count.should == 4
  end
end
|
data/spec/path_spec.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe DmozSax::Path do
  it "takes a / delimited string in its initializer" do
    path = DmozSax::Path.new('This/Topic/Path')
    path.to_a.should == ['This','Topic','Path']
  end

  # Only single upper-case letters are exercised here.
  it "removes the 'Top' category and English index categories (e.g. 'a' to 'z')" do

    ('A'..'Z').each do |char|
      path = DmozSax::Path.new("Top/This/Topic/#{ char }/Path")
      path.to_a.should == ['This','Topic','Path']
    end
  end

  it "may optionally be preceeded by a name or identifier" do
    path = DmozSax::Path.new("Sample_Directory:Top/This/Topic/Path")
    path.name.should == 'Sample Directory'
    path.to_a.should == ['This','Topic','Path']
  end

  context "as an immutable array" do

    it "supports enumeration methods" do
      path = DmozSax::Path.new 'This/Topic/Path'
      path.length.should == 3
      path.count.should == 3
      path.size.should == 3

      path.each do |a| a.should_not be_nil end
      path.map {|a| a.downcase}.should == ['this','topic','path']
      path.inject(0) {|i,a| i + a.length}.should == 13
    end

    it "throws exceptions if modification attempted" do
      # The path must exist in this example; with an undefined local the
      # block raised NameError and the expectation passed without ever
      # touching a Path. Modifying a frozen array raises RuntimeError (or its
      # subclass FrozenError on newer Rubies).
      path = DmozSax::Path.new 'This/Topic/Path'
      expect { path[0] = 'Bob' }.to raise_error(RuntimeError)
      path.to_a.should == ['This','Topic','Path']
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<RDF xmlns:r="http://www.w3.org/TR/RDF/" xmlns:d="http://purl.org/dc/elements/1.0/" xmlns="http://dmoz.org/rdf/">
|
3
|
+
<!-- Generated at 2013-01-27 00:03:25 EST from DMOZ 2.0 -->
|
4
|
+
<Topic r:id="">
|
5
|
+
<catid>1</catid>
|
6
|
+
</Topic>
|
7
|
+
<Topic r:id="Top/Arts">
|
8
|
+
<catid>381773</catid>
|
9
|
+
</Topic>
|
10
|
+
<Topic r:id="Top/Arts/Animation">
|
11
|
+
<catid>423945</catid>
|
12
|
+
<link1 r:resource="http://www.awn.com/"></link1>
|
13
|
+
<link r:resource="http://animation.about.com/"></link>
|
14
|
+
<link r:resource="http://www.toonhound.com/"></link>
|
15
|
+
<link r:resource="http://enculturation.gmu.edu/2_1/pisters.html"></link>
|
16
|
+
<link r:resource="http://www.digitalmediafx.com/Features/animationhistory.html"></link>
|
17
|
+
<link r:resource="http://www.spark-online.com/august00/media/romano.html"></link>
|
18
|
+
<link r:resource="http://www.animated-divots.net/"></link>
|
19
|
+
</Topic>
|
20
|
+
<ExternalPage about="http://www.awn.com/">
|
21
|
+
<d:Title>Animation World Network</d:Title>
|
22
|
+
<d:Description>Provides information resources to the international animation community. Features include searchable database archives, monthly magazine, web animation guide, the Animation Village, discussion forums and other useful resources.</d:Description>
|
23
|
+
<priority>1</priority>
|
24
|
+
<topic>Top/Arts/Animation</topic>
|
25
|
+
</ExternalPage>
|
26
|
+
<ExternalPage about="http://animation.about.com/">
|
27
|
+
<d:Title>About.com: Animation Guide</d:Title>
|
28
|
+
<d:Description>Keep up with developments in online animation for all skill levels. Download tools, and seek inspiration from online work.</d:Description>
|
29
|
+
<topic>Top/Arts/Animation</topic>
|
30
|
+
</ExternalPage>
|
31
|
+
<ExternalPage about="http://www.toonhound.com/">
|
32
|
+
<d:Title>Toonhound</d:Title>
|
33
|
+
<d:Description>British cartoon, animation and comic strip creations - links, reviews and news from the UK.</d:Description>
|
34
|
+
<topic>Top/Arts/Animation</topic>
|
35
|
+
</ExternalPage>
|
36
|
+
<ExternalPage about="http://enculturation.gmu.edu/2_1/pisters.html">
|
37
|
+
<d:Title>Enculturation: From Mouse to Mouse: Overcoming Information</d:Title>
|
38
|
+
<d:Description>Essay by Patricia Pisters on the animated image and its changing relationship with the cinematic image.</d:Description>
|
39
|
+
<topic>Top/Arts/Animation</topic>
|
40
|
+
</ExternalPage>
|
41
|
+
</RDF>
|
@@ -0,0 +1,48 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<RDF xmlns:r="http://www.w3.org/TR/RDF/" xmlns:d="http://purl.org/dc/elements/1.0/" xmlns="http://dmoz.org/rdf/">
|
3
|
+
<!--
|
4
|
+
|
5
|
+
DMOZ Content is licensed under the Creative Commons
|
6
|
+
Attribution 3.0 Unported (CC BY 3.0)
|
7
|
+
( http://creativecommons.org/licenses/by/3.0/ )
|
8
|
+
|
9
|
+
|
10
|
+
-->
|
11
|
+
<Topic r:id="Top/World">
|
12
|
+
<catid>2</catid>
|
13
|
+
<d:Title>Top</d:Title>
|
14
|
+
<lastUpdate>2010-08-16 06:05:15</lastUpdate>
|
15
|
+
<d:Description></d:Description>
|
16
|
+
<narrow r:resource="Top/Arts"></narrow>
|
17
|
+
<related r:resource="Kids_and_Teens"></related>
|
18
|
+
</Topic>
|
19
|
+
<Topic r:id="Top/Arts">
|
20
|
+
<catid>381773</catid>
|
21
|
+
<d:Title>Arts</d:Title>
|
22
|
+
<lastUpdate>2011-08-06 17:20:44</lastUpdate>
|
23
|
+
<d:Description><img src="/img/moz/mzcolor.gif" alt="Image from Mozilla museum: Mozilla as an Artist" width="128" height="120" align="right"> <p>The ODP <b>Arts</b> category contains English language sites about art, or "the use of skill and imagination in the creation of aesthetic objects, environments, or experiences that can be shared with others." This includes the "liberal arts," concerned with skill of expression in language, speech, and reasoning, and the "fine arts," concerned with affecting aesthetics directly, and especially affecting the sense of beauty. <small>(Quotes and paraphrases from <a href="http://www.britannica.com/">Britannica.com</a>)</small> <p>Art is an abstract and subjective quality: It can be studied, but cannot be objectively measured, counted, weighed, or absolutely compared; it can only appeal to the viewers or audience's personal senses.</d:Description>
|
24
|
+
<narrow1 r:resource="Top/Arts/Directories"></narrow1>
|
25
|
+
<narrow1 r:resource="Top/Arts/News_and_Media"></narrow1>
|
26
|
+
<narrow1 r:resource="Top/Arts/Weblogs"></narrow1>
|
27
|
+
<narrow1 r:resource="Top/Arts/Chats_and_Forums"></narrow1>
|
28
|
+
<narrow2 r:resource="Top/Arts/Art_History"></narrow2>
|
29
|
+
<narrow2 r:resource="Top/Arts/Crafts"></narrow2>
|
30
|
+
<altlang r:resource="Tamil:Top/World/Tamil/கலை"></altlang>
|
31
|
+
<altlang r:resource="English:Top/World/O'zbekcha/San’at"></altlang>
|
32
|
+
<altlang r:resource="Euskara:Top/World/Euskara/Kultura"></altlang>
|
33
|
+
<altlang r:resource="Rumantsch:Top/World/Rumantsch/Art"></altlang>
|
34
|
+
<symbolic1 r:resource="Publishers:Top/Business/Publishing_and_Printing/Publishing/Books/Arts"></symbolic1>
|
35
|
+
<related r:resource="Top/Business/Arts_and_Entertainment"></related>
|
36
|
+
<symbolic2 r:resource="Native_and_Tribal:Top/Arts/Visual_Arts/Native_and_Tribal"></symbolic2>
|
37
|
+
<symbolic2 r:resource="Typography:Top/Arts/Graphic_Design/Typography"></symbolic2>
|
38
|
+
<related r:resource="Kids_and_Teens/Arts"></related>
|
39
|
+
</Topic>
|
40
|
+
<Alias r:id="Publishers:Top/Business/Publishing_and_Printing/Publishing/Books/Arts">
|
41
|
+
<d:Title>Publishers</d:Title>
|
42
|
+
<Target r:resource="Top/Business/Publishing_and_Printing/Publishing/Books/Arts"/>
|
43
|
+
</Alias>
|
44
|
+
<Alias r:id="Native_and_Tribal:Top/Arts/Visual_Arts/Native_and_Tribal">
|
45
|
+
<d:Title>Native_and_Tribal</d:Title>
|
46
|
+
<Target r:resource="Top/Arts/Visual_Arts/Native_and_Tribal"/>
|
47
|
+
</Alias>
|
48
|
+
</RDF>
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe DmozSax::StructureDocument do
  #it 'can parse a real structure.rdf.u8 document' do
  #  parser = Nokogiri::XML::SAX::Parser.new(DmozSax::StructureDocument.new)
  #  parser.parse(File.open('/opt/data/DMOZ/structure.rdf.u8'))
  #end

  it 'can parse a sample structure.rdf.u8 document' do

    topics = []
    aliases = []

    document = DmozSax::StructureDocument.new
    document.on_topic = lambda {|t| topics << t }

    document.on_alias = lambda {|a| aliases << a }

    parser = Nokogiri::XML::SAX::Parser.new(document)
    # Block form closes the sample file once parsing has finished.
    File.open('spec/samples/structure_sample.rdf.u8') {|io| parser.parse(io) }

    topics.count.should == 2
    topics[1].title.should == 'Arts'
    topics[1].path.should == ['Arts']
    topics[1].description.should include 'aesthetic objects'
    topics[1].cid.should == 381773

    aliases.count.should == 2
    aliases[0].title.should == 'Publishers'
    aliases[0].path.should == ['Business','Publishing and Printing','Publishing','Books','Arts']
  end
end
|
data/spec/topic_spec.rb
ADDED
metadata
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: dmoz_sax_doc
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Galen Palmer
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-28 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.5'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.5'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: json
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '1.7'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '1.7'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rspec
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '2.12'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '2.12'
|
62
|
+
description: Use a SAX parser to visit either the structure.u8 or content.u8 DMOZ
|
63
|
+
files.
|
64
|
+
email:
|
65
|
+
- palmergs@gmail.com
|
66
|
+
executables: []
|
67
|
+
extensions: []
|
68
|
+
extra_rdoc_files: []
|
69
|
+
files:
|
70
|
+
- Gemfile
|
71
|
+
- Gemfile.lock
|
72
|
+
- README.md
|
73
|
+
- Rakefile
|
74
|
+
- dmoz_sax_doc.gemspec
|
75
|
+
- lib/dmoz_sax.rb
|
76
|
+
- lib/dmoz_sax/alias.rb
|
77
|
+
- lib/dmoz_sax/content_document.rb
|
78
|
+
- lib/dmoz_sax/external_page.rb
|
79
|
+
- lib/dmoz_sax/name_parser.rb
|
80
|
+
- lib/dmoz_sax/path.rb
|
81
|
+
- lib/dmoz_sax/structure_document.rb
|
82
|
+
- lib/dmoz_sax/time_parser.rb
|
83
|
+
- lib/dmoz_sax/topic.rb
|
84
|
+
- lib/dmoz_sax/version.rb
|
85
|
+
- spec/alias_spec.rb
|
86
|
+
- spec/content_document_spec.rb
|
87
|
+
- spec/external_page_spec.rb
|
88
|
+
- spec/path_spec.rb
|
89
|
+
- spec/samples/content_sample.rdf.u8
|
90
|
+
- spec/samples/structure_sample.rdf.u8
|
91
|
+
- spec/spec_helper.rb
|
92
|
+
- spec/structure_document_spec.rb
|
93
|
+
- spec/topic_spec.rb
|
94
|
+
- spec/version_spec.rb
|
95
|
+
homepage: https://github.com/palmergs/dmoz_sax_doc
|
96
|
+
licenses: []
|
97
|
+
post_install_message:
|
98
|
+
rdoc_options: []
|
99
|
+
require_paths:
|
100
|
+
- lib
|
101
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
102
|
+
none: false
|
103
|
+
requirements:
|
104
|
+
- - '>='
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
108
|
+
none: false
|
109
|
+
requirements:
|
110
|
+
- - '>='
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: '0'
|
113
|
+
requirements: []
|
114
|
+
rubyforge_project:
|
115
|
+
rubygems_version: 1.8.25
|
116
|
+
signing_key:
|
117
|
+
specification_version: 3
|
118
|
+
summary: SAX visitor for DMOZ structure of content files.
|
119
|
+
test_files:
|
120
|
+
- spec/alias_spec.rb
|
121
|
+
- spec/content_document_spec.rb
|
122
|
+
- spec/external_page_spec.rb
|
123
|
+
- spec/path_spec.rb
|
124
|
+
- spec/samples/content_sample.rdf.u8
|
125
|
+
- spec/samples/structure_sample.rdf.u8
|
126
|
+
- spec/spec_helper.rb
|
127
|
+
- spec/structure_document_spec.rb
|
128
|
+
- spec/topic_spec.rb
|
129
|
+
- spec/version_spec.rb
|