saxony 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES.txt +6 -0
- data/README.md +5 -2
- data/lib/saxony.rb +25 -32
- data/saxony.gemspec +2 -2
- metadata +4 -4
data/CHANGES.txt
CHANGED
data/README.md
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
## Saxony - 0.1 ##
|
2
2
|
|
3
|
-
**Parse gigantic XML files with pleasure and
|
3
|
+
**Parse gigantic XML files with pleasure and a without running out of memory.**
|
4
4
|
|
5
5
|
## Example ##
|
6
6
|
|
7
7
|
sax = Saxony.new :SomeObject, 1000
|
8
8
|
sax.parse 'path/2/huge.xml' do
|
9
9
|
total_count # => Total number of SomeObjects processed
|
10
|
-
doc # => Nokogiri object for 1000
|
10
|
+
doc # => Nokogiri object for 1000 SomeObjects
|
11
11
|
elapsed_time # => time processing current batch
|
12
|
+
path # => Current file being processed
|
13
|
+
xml # => The XML containing 1000 SomeObjects
|
12
14
|
end
|
13
15
|
|
14
16
|
## Credits
|
@@ -18,6 +20,7 @@
|
|
18
20
|
|
19
21
|
## Thanks
|
20
22
|
|
23
|
+
* [Nokogiri](http://nokogiri.org/)
|
21
24
|
|
22
25
|
## License
|
23
26
|
|
data/lib/saxony.rb
CHANGED
@@ -3,9 +3,10 @@ require 'stringio'
|
|
3
3
|
|
4
4
|
|
5
5
|
class Saxony
|
6
|
-
VERSION = "0.1.1"
|
6
|
+
VERSION = "0.1.2".freeze unless defined?(Saxony::VERSION)
|
7
7
|
|
8
8
|
class Document < Nokogiri::XML::SAX::Document
|
9
|
+
attr_accessor :path
|
9
10
|
attr_reader :total_count, :granularity
|
10
11
|
def initialize(element, granularity, &processor)
|
11
12
|
@root_element = nil
|
@@ -59,7 +60,7 @@ class Saxony
|
|
59
60
|
reset
|
60
61
|
end
|
61
62
|
def reset
|
62
|
-
@xml = nil
|
63
|
+
@xml, @path = nil, nil
|
63
64
|
@buffer, @count, @doc, @start_time = StringIO.new, 0, nil, Time.now
|
64
65
|
end
|
65
66
|
def to_otag(name, attributes=[])
|
@@ -91,17 +92,37 @@ class Saxony
|
|
91
92
|
sources.each do |src|
|
92
93
|
saxdoc = Saxony::Document.new @element, @granularity, &blk
|
93
94
|
parser = Nokogiri::XML::SAX::Parser.new(saxdoc)
|
94
|
-
|
95
|
+
if (String === src && File.exists?(src))
|
96
|
+
xml = File.open(src)
|
97
|
+
saxdoc.path = src
|
98
|
+
else
|
99
|
+
xml = src
|
100
|
+
saxdoc.path = src.class.to_s
|
101
|
+
end
|
95
102
|
parser.parse xml
|
96
103
|
end
|
97
104
|
end
|
98
105
|
end
|
99
106
|
|
107
|
+
class Array
|
108
|
+
def saxony_chunk(number_of_chunks)
|
109
|
+
chunks = (1..number_of_chunks).collect { [] }
|
110
|
+
while self.any?
|
111
|
+
chunks.each do |a_chunk|
|
112
|
+
a_chunk << self.shift if self.any?
|
113
|
+
end
|
114
|
+
end
|
115
|
+
chunks
|
116
|
+
end
|
117
|
+
alias_method :chunk, :saxony_chunk unless method_defined? :chunk
|
118
|
+
end
|
119
|
+
|
120
|
+
|
100
121
|
#STDERR.print '.' if @samples % 5000 == 0
|
101
122
|
|
102
123
|
if $0 == __FILE__
|
103
124
|
sax = Saxony.new :Listing, 1000
|
104
|
-
sax.parse
|
125
|
+
sax.parse DATA do
|
105
126
|
#doc.xpath("//Listing").each do |obj|
|
106
127
|
#end
|
107
128
|
p [total_count, doc.xpath("//Listing").size, elapsed_time.to_f]
|
@@ -110,32 +131,4 @@ if $0 == __FILE__
|
|
110
131
|
end
|
111
132
|
end
|
112
133
|
|
113
|
-
__END__
|
114
|
-
|
115
|
-
<BusinessListings>
|
116
|
-
<Listing><ListingId>17</ListingId><DBID>16</DBID><BusName>'A' Company Military Surplus</BusName><BusNameFr>'A' Company Military Surplus</BusNameFr><Address>2240 Alberni Hwy</Address><City>Parksville</City><PstCode>V0R1M0</PstCode><Phone><Primary><Prefix>+1</Prefix><NPA>250</NPA><NXX>951</NXX><XNUM>0609</XNUM><DisplayNumber>250-951-0609</DisplayNumber></Primary><Other Type="Click2Call"><Prefix>+1</Prefix><NPA>250</NPA><NXX>951</NXX><XNUM>0609</XNUM><DisplayNumber>250-951-0609</DisplayNumber></Other></Phone>
|
117
|
-
<ListingKeys>D00007295080000465894</ListingKeys><ReportId>16</ReportId><Paid>Y</Paid><ListEntry><DirProv>BC</DirProv><DirCode>022000</DirCode><HdCode>00866400</HdCode><Channel>2</Channel><Rank>7</Rank><NormRank>0</NormRank><Placement Child="false">DPlus</Placement><Products><HS DirPlus="1HS" true="Lang" AdNo="EN" 13980461ab="Rank" PrdCode="7" WEBHS3="Colour" Udac=""><Keywords><Classification><Heading HdCode="HdName" 00866400=""></Heading></Classification><Raw>OPEN 7 DAYS A WEEK CALL US FOR SPECIALS</Raw><HrsOpr>7days</HrsOpr></Keywords><Text><Line Num="Val" 1="OPEN 7 DAYS A WEEK"></Line>
|
118
|
-
<Line Num="Val" 2="CALL US FOR SPECIALS"></Line>
|
119
|
-
</Text></HS></Products>
|
120
|
-
</ListEntry><ListEntry><DirProv>BC</DirProv><DirCode>086494</DirCode><HdCode>00866400</HdCode><Channel>1</Channel><Rank>7</Rank><NormRank>0</NormRank><Placement Child="false">DPlus</Placement><Products><HS DirPlus="1HS" true="Lang" AdNo="EN" 13912789ab="Rank" PrdCode="7" WEBHS3="Colour" Udac=""><Keywords><Classification><Heading HdCode="HdName" 00866400=""></Heading></Classification><Raw>OPEN 7 DAYS A WEEK CALL US FOR SPECIALS</Raw><HrsOpr>7days</HrsOpr></Keywords><Text><Line Num="Val" 1="OPEN 7 DAYS A WEEK"></Line>
|
121
|
-
<Line Num="Val" 2="CALL US FOR SPECIALS"></Line>
|
122
|
-
</Text></HS></Products>
|
123
|
-
</ListEntry><ListEntry><DirProv>BC</DirProv><DirCode>086604</DirCode><HdCode>00866400</HdCode><Channel>1</Channel><Rank>7</Rank><NormRank>0</NormRank><Placement Child="false">DPlus</Placement><Products><HS DirPlus="1HS" true="Lang" AdNo="EN" 13908447ab="Rank" PrdCode="7" WEBHS3="Colour" Udac=""><Keywords><Classification><Heading HdCode="HdName" 00866400=""></Heading></Classification><Raw>OPEN 7 DAYS A WEEK CALL US FOR SPECIALS</Raw><HrsOpr>7days</HrsOpr></Keywords><Text><Line Num="Val" 1="OPEN 7 DAYS A WEEK"></Line>
|
124
|
-
<Line Num="Val" 2="CALL US FOR SPECIALS"></Line>
|
125
|
-
</Text></HS></Products>
|
126
|
-
</ListEntry><ListEntry><DirProv>BC</DirProv><DirCode>086652</DirCode><HdCode>00866400</HdCode><Channel>1</Channel><Rank>7</Rank><NormRank>0</NormRank><Placement Child="false">DPlus</Placement><Products><HS DirPlus="1HS" true="Lang" AdNo="EN" 13890219ab="Rank" PrdCode="7" WEBHS3="Colour" Udac=""><Keywords><Classification><Heading HdCode="HdName" 00866400=""></Heading></Classification><Raw>OPEN 7 DAYS A WEEK CALL US FOR SPECIALS</Raw><HrsOpr>7days</HrsOpr></Keywords><Text><Line Num="Val" 1="OPEN 7 DAYS A WEEK"></Line>
|
127
|
-
<Line Num="Val" 2="CALL US FOR SPECIALS"></Line>
|
128
|
-
</Text></HS></Products>
|
129
|
-
</ListEntry><ListEntry><DirProv>BC</DirProv><DirCode>086926</DirCode><HdCode>00866400</HdCode><Channel>1</Channel><Rank>7</Rank><NormRank>0</NormRank><Placement Child="false">DPlus</Placement><Products><HS DirPlus="1HS" true="Lang" AdNo="EN" 13980461ab="Rank" PrdCode="7" WEBHS3="Colour" Udac=""><Keywords><Classification><Heading HdCode="HdName" 00866400=""></Heading></Classification><Raw>OPEN 7 DAYS A WEEK CALL US FOR SPECIALS</Raw><HrsOpr>7days</HrsOpr></Keywords><Text><Line Num="Val" 1="OPEN 7 DAYS A WEEK"></Line>
|
130
|
-
<Line Num="Val" 2="CALL US FOR SPECIALS"></Line>
|
131
|
-
</Text></HS></Products>
|
132
|
-
</ListEntry></Listing>
|
133
|
-
<Listing><ListingId>19</ListingId><DBID>18</DBID><BusName>'Colleen All Dogs' Doggie Daycare</BusName><BusNameFr>'Colleen All Dogs' Doggie Daycare</BusNameFr><Address>6058 144 Street</Address><City>Surrey</City><Prov>BC</Prov><PstCode>V3X1A3</PstCode><Lat>49.113197</Lat><Long>-122.823369</Long><Phone><Primary><Prefix>+1</Prefix><NPA>604</NPA><NXX>319</NXX><XNUM>3895</XNUM><DisplayNumber>604-319-3895</DisplayNumber></Primary><Other Type="Click2Call"><Prefix>+1</Prefix><NPA>604</NPA><NXX>319</NXX><XNUM>3895</XNUM><DisplayNumber>604-319-3895</DisplayNumber></Other></Phone>
|
134
|
-
<ListingKeys>D00007440120000535278</ListingKeys><ReportId>18</ReportId><Paid>Y</Paid><ListEntry><DirProv>BC</DirProv><DirCode>086446</DirCode><HdCode>00980600</HdCode><Channel>1</Channel><Rank>100</Rank><NormRank>6</NormRank><Placement Child="false">Other</Placement><Products><URL Type="Lang" URL="EN" PrdCode="LinkText" P_LINK="" Val="UrlImg" http://www.colleenalldogs.com="u2/b/ad8/bad8592a30566ecbe27da92022963564.jpg" Udac="Rank" SUPEB="100"></URL><URL Type="Lang" URL="FR" PrdCode="LinkText" P_LINK="" Val="UrlImg" http://www.colleenalldogs.com="u2/b/ad8/bad8592a30566ecbe27da92022963564.jpg" Udac="Rank" SUPEB="100"></URL></Products>
|
135
|
-
</ListEntry><ListEntry><DirProv>BC</DirProv><DirCode>086446</DirCode><HdCode>00980355</HdCode><Channel>1</Channel><Rank>194</Rank><NormRank>12</NormRank><Placement Child="false">DPlus</Placement><Products><D_PP PrdCode="EN" D_PP="ProfileId" Type="18042" PPLUS="DirPath" Udac="18042" PPE="Rank" Lang="50"><Keywords><OpenHrs>Monday 7:00 am - 6:30 pm</OpenHrs><OpenHrs>Tuesday 7:00 am - 6:30 pm</OpenHrs><OpenHrs>Wednesday 7:00 am - 6:30 pm</OpenHrs><OpenHrs>Thursday 7:00 am - 6:30 pm</OpenHrs><OpenHrs>Friday 7:00 am - 6:30 pm</OpenHrs><LangSpk>English</LangSpk><GetThr>King George Highway</GetThr><ProdServ>Administer Medications</ProdServ><ProdServ>Animal Care Experience</ProdServ><ProdServ>Dog Daycare</ProdServ><ProdServ>Dog Mind & Body Stimulation</ProdServ><ProdServ>Dog Playhouse</ProdServ><ProdServ>Pet Portraits</ProdServ><ProdServ>Pet Shop</ProdServ></Keywords></D_PP>
|
136
|
-
<D_PP PrdCode="FR" D_PP="ProfileId" Type="18042" PPLUS="DirPath" Udac="18042" ="Rank" Lang="0"><Keywords><OpenHrs>Monday 7:00 am - 6:30 pm</OpenHrs><OpenHrs>Tuesday 7:00 am - 6:30 pm</OpenHrs><OpenHrs>Wednesday 7:00 am - 6:30 pm</OpenHrs><OpenHrs>Thursday 7:00 am - 6:30 pm</OpenHrs><OpenHrs>Friday 7:00 am - 6:30 pm</OpenHrs><LangSpk>English</LangSpk><GetThr>King George Highway</GetThr><ProdServ>Administer Medications</ProdServ><ProdServ>Animal Care Experience</ProdServ><ProdServ>Dog Daycare</ProdServ><ProdServ>Dog Mind & Body Stimulation</ProdServ><ProdServ>Dog Playhouse</ProdServ><ProdServ>Pet Portraits</ProdServ><ProdServ>Pet Shop</ProdServ></Keywords></D_PP>
|
137
|
-
<URL Type="Lang" URL="EN" PrdCode="LinkText" URL="" Val="UrlImg" http://www.colleenalldogs.com="u2/b/ad8/bad8592a30566ecbe27da92022963564.jpg" Udac="Rank" URL0="0"></URL><URL Type="Lang" URL="FR" PrdCode="LinkText" URL="" Val="UrlImg" http://www.colleenalldogs.com="u2/b/ad8/bad8592a30566ecbe27da92022963564.jpg" Udac="Rank" URL0="0"></URL><URL Type="Lang" URL="EN" PrdCode="LinkText" P_LINK="" Val="UrlImg" http://www.colleenalldogs.com="u2/b/ad8/bad8592a30566ecbe27da92022963564.jpg" Udac="Rank" SUPEB="100"></URL><URL Type="Lang" URL="FR" PrdCode="LinkText" P_LINK="" Val="UrlImg" http://www.colleenalldogs.com="u2/b/ad8/bad8592a30566ecbe27da92022963564.jpg" Udac="Rank" SUPEB="100"></URL><Thumb Lang="THUMB" EN="Udac" Val="QCW" 14571890aa="Rank" Type="44" THUMB="DirPlus" PrdCode="true"></Thumb><Thumb Lang="THUMB" FR="Udac" Val="QCW" 14571890aa="Rank" Type="44" THUMB="DirPlus" PrdCode="true"></Thumb><DspAd Rank="DISPADT" 44="Lang" DirPlus="EN" true="Udac" AdNo="QCW" 14571890aa="Type" PrdCode="DspAd"><Keywords><Classification><Heading HdCode="HdName" 00980355=""></Heading></Classification><Raw>COLLEEN ALL DOGS Doggie Daycare 1/2 Acr 1/2 Acre of Secured Ine of Secured Indoodoor/Outr/Outdoodoor Spacr Spacee Puppy Social Puppy Socialization, 100%ization, 100% Su Superpervisvisionion An Any Agey Age/Size,/Size, By By Appoint Appointmenment Onlyt Only Pet Firs Pet First Aid, 17 t Aid, 17 YrsYrs Ani Animal Knowledgemal Knowledge 604-604-319-38319-389595 6058 144th St Surrey, BC www.colleewww.colleewww.colleenallnallnalldogs.dogs.dogs.comcomcom</Raw></Keywords></DspAd></Products>
|
138
|
-
</ListEntry><ListEntry><DirProv>BC</DirProv><DirCode>086446</DirCode><HdCode>00740000</HdCode><Channel>1</Channel><Rank>100</Rank><NormRank>6</NormRank><Placement Child="false">Other</Placement><Products><URL Type="Lang" URL="EN" PrdCode="LinkText" P_LINK="" Val="UrlImg" http://www.colleenalldogs.com="u2/b/ad8/bad8592a30566ecbe27da92022963564.jpg" Udac="Rank" SUPEB="100"></URL><URL Type="Lang" URL="FR" PrdCode="LinkText" P_LINK="" Val="UrlImg" http://www.colleenalldogs.com="u2/b/ad8/bad8592a30566ecbe27da92022963564.jpg" Udac="Rank" SUPEB="100"></URL></Products>
|
139
|
-
</ListEntry></Listing>
|
140
|
-
</BusinessListings>
|
141
134
|
|
data/saxony.gemspec
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
@spec = Gem::Specification.new do |s|
|
2
2
|
s.name = "saxony"
|
3
3
|
s.rubyforge_project = 'bone'
|
4
|
-
s.version = "0.1.1"
|
5
|
-
s.summary = "Parse gigantic XML files with pleasure and
|
4
|
+
s.version = "0.1.2"
|
5
|
+
s.summary = "Parse gigantic XML files with pleasure and a without running out of memory."
|
6
6
|
s.description = s.summary
|
7
7
|
s.author = "Delano Mandelbaum"
|
8
8
|
s.email = "delano@solutious.com"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: saxony
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Delano Mandelbaum
|
@@ -22,7 +22,7 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: "0"
|
24
24
|
version:
|
25
|
-
description: Parse gigantic XML files with pleasure and
|
25
|
+
description: Parse gigantic XML files with pleasure and a without running out of memory.
|
26
26
|
email: delano@solutious.com
|
27
27
|
executables: []
|
28
28
|
|
@@ -48,7 +48,7 @@ post_install_message:
|
|
48
48
|
rdoc_options:
|
49
49
|
- --line-numbers
|
50
50
|
- --title
|
51
|
-
- Parse gigantic XML files with pleasure and
|
51
|
+
- Parse gigantic XML files with pleasure and a without running out of memory.
|
52
52
|
- --main
|
53
53
|
- README.md
|
54
54
|
require_paths:
|
@@ -71,6 +71,6 @@ rubyforge_project: bone
|
|
71
71
|
rubygems_version: 1.3.5
|
72
72
|
signing_key:
|
73
73
|
specification_version: 3
|
74
|
-
summary: Parse gigantic XML files with pleasure and
|
74
|
+
summary: Parse gigantic XML files with pleasure and a without running out of memory.
|
75
75
|
test_files: []
|
76
76
|
|