bliss 0.0.7 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,12 @@
1
+ == 0.0.9
2
+
3
+ * Features
4
+
5
+ * added support for Gzip (content type is autodetected).
6
+ * on_max_unhandled_bytes callback block, which receives the number of bytes and a block to execute when that limit is reached.
7
+ * on_tag_close may be used without a specific tag name, and the block now handles the "current depth" array.
8
+ * Now featuring a changelog.
9
+
10
+ * Bugfixes
11
+
12
+ * When the XML was too small, it ended through the http.callback block and did not close the file. That led to errors and is fixed in this version.
data/Gemfile CHANGED
@@ -10,8 +10,8 @@ gem "em-http-request", ">= 1.0.2"
10
10
  # Add dependencies to develop your gem here.
11
11
  # Include everything needed to run rake, tests, features, etc.
12
12
  group :development do
13
- gem "shoulda", ">= 0"
14
- gem "bundler", "~> 1.0.0"
13
+ gem "rspec", "~> 2.8.0"
14
+ gem "bundler", "~> 1.1.3"
15
15
  gem "jeweler", "~> 1.6.4"
16
16
  gem "simplecov"
17
17
  #gem "rcov", ">= 0"
@@ -3,6 +3,7 @@ GEM
3
3
  specs:
4
4
  addressable (2.2.7)
5
5
  cookiejar (0.3.0)
6
+ diff-lcs (1.1.3)
6
7
  em-http-request (1.0.2)
7
8
  addressable (>= 2.2.3)
8
9
  cookiejar
@@ -21,11 +22,14 @@ GEM
21
22
  multi_json (1.1.0)
22
23
  nokogiri (1.5.2)
23
24
  rake (0.9.2.2)
24
- shoulda (3.0.1)
25
- shoulda-context (~> 1.0.0)
26
- shoulda-matchers (~> 1.0.0)
27
- shoulda-context (1.0.0)
28
- shoulda-matchers (1.0.0)
25
+ rspec (2.8.0)
26
+ rspec-core (~> 2.8.0)
27
+ rspec-expectations (~> 2.8.0)
28
+ rspec-mocks (~> 2.8.0)
29
+ rspec-core (2.8.0)
30
+ rspec-expectations (2.8.0)
31
+ diff-lcs (~> 1.1.2)
32
+ rspec-mocks (2.8.0)
29
33
  simplecov (0.6.1)
30
34
  multi_json (~> 1.0)
31
35
  simplecov-html (~> 0.5.3)
@@ -35,10 +39,10 @@ PLATFORMS
35
39
  ruby
36
40
 
37
41
  DEPENDENCIES
38
- bundler (~> 1.0.0)
42
+ bundler (~> 1.1.3)
39
43
  em-http-request (>= 1.0.2)
40
44
  eventmachine (>= 1.0.0.beta.4)
41
45
  jeweler (~> 1.6.4)
42
46
  nokogiri (>= 1.5.2)
43
- shoulda
47
+ rspec (~> 2.8.0)
44
48
  simplecov
data/Rakefile CHANGED
@@ -26,6 +26,12 @@ Jeweler::Tasks.new do |gem|
26
26
  end
27
27
  Jeweler::RubygemsDotOrgTasks.new
28
28
 
29
+ require 'rspec/core'
30
+ require 'rspec/core/rake_task'
31
+ RSpec::Core::RakeTask.new(:spec) do |spec|
32
+ spec.pattern = FileList['spec/**/*_spec.rb']
33
+ end
34
+
29
35
  require 'rake/testtask'
30
36
  Rake::TestTask.new(:test) do |test|
31
37
  test.libs << 'lib' << 'test'
@@ -41,7 +47,7 @@ end
41
47
  # test.rcov_opts << '--exclude "gems/*"'
42
48
  #end
43
49
 
44
- task :default => :test
50
+ task :default => :spec
45
51
 
46
52
  require 'rake/rdoctask'
47
53
  Rake::RDocTask.new do |rdoc|
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.7
1
+ 0.0.9
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "bliss"
8
- s.version = "0.0.7"
8
+ s.version = "0.0.9"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Fernando Alonso"]
12
- s.date = "2012-03-21"
12
+ s.date = "2012-06-04"
13
13
  s.description = "streamed xml parsing tool"
14
14
  s.email = "krakatoa1987@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -18,6 +18,7 @@ Gem::Specification.new do |s|
18
18
  ]
19
19
  s.files = [
20
20
  ".document",
21
+ "CHANGELOG.rdoc",
21
22
  "Gemfile",
22
23
  "Gemfile.lock",
23
24
  "LICENSE.txt",
@@ -25,13 +26,20 @@ Gem::Specification.new do |s|
25
26
  "Rakefile",
26
27
  "VERSION",
27
28
  "bliss.gemspec",
29
+ "complete_test.rb",
30
+ "gzip_support.rb",
28
31
  "hash.rb",
29
32
  "http-machine.rb",
30
33
  "lib/bliss.rb",
34
+ "lib/bliss/constraint.rb",
35
+ "lib/bliss/encoding_error.rb",
36
+ "lib/bliss/format.rb",
31
37
  "lib/bliss/parser.rb",
32
38
  "lib/bliss/parser_machine.rb",
33
- "lib/bliss/sax_parser.rb",
34
39
  "lib/hash_extension.rb",
40
+ "spec.yml",
41
+ "spec/format_spec.rb",
42
+ "spec/spec_helper.rb",
35
43
  "test.rb",
36
44
  "test/helper.rb",
37
45
  "test/test_bliss.rb"
@@ -39,7 +47,7 @@ Gem::Specification.new do |s|
39
47
  s.homepage = "http://github.com/krakatoa/bliss"
40
48
  s.licenses = ["MIT"]
41
49
  s.require_paths = ["lib"]
42
- s.rubygems_version = "1.8.15"
50
+ s.rubygems_version = "1.8.10"
43
51
  s.summary = "streamed xml parsing tool"
44
52
 
45
53
  if s.respond_to? :specification_version then
@@ -49,16 +57,16 @@ Gem::Specification.new do |s|
49
57
  s.add_runtime_dependency(%q<nokogiri>, [">= 1.5.2"])
50
58
  s.add_runtime_dependency(%q<eventmachine>, [">= 1.0.0.beta.4"])
51
59
  s.add_runtime_dependency(%q<em-http-request>, [">= 1.0.2"])
52
- s.add_development_dependency(%q<shoulda>, [">= 0"])
53
- s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
60
+ s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
61
+ s.add_development_dependency(%q<bundler>, ["~> 1.1.3"])
54
62
  s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
55
63
  s.add_development_dependency(%q<simplecov>, [">= 0"])
56
64
  else
57
65
  s.add_dependency(%q<nokogiri>, [">= 1.5.2"])
58
66
  s.add_dependency(%q<eventmachine>, [">= 1.0.0.beta.4"])
59
67
  s.add_dependency(%q<em-http-request>, [">= 1.0.2"])
60
- s.add_dependency(%q<shoulda>, [">= 0"])
61
- s.add_dependency(%q<bundler>, ["~> 1.0.0"])
68
+ s.add_dependency(%q<rspec>, ["~> 2.8.0"])
69
+ s.add_dependency(%q<bundler>, ["~> 1.1.3"])
62
70
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
63
71
  s.add_dependency(%q<simplecov>, [">= 0"])
64
72
  end
@@ -66,8 +74,8 @@ Gem::Specification.new do |s|
66
74
  s.add_dependency(%q<nokogiri>, [">= 1.5.2"])
67
75
  s.add_dependency(%q<eventmachine>, [">= 1.0.0.beta.4"])
68
76
  s.add_dependency(%q<em-http-request>, [">= 1.0.2"])
69
- s.add_dependency(%q<shoulda>, [">= 0"])
70
- s.add_dependency(%q<bundler>, ["~> 1.0.0"])
77
+ s.add_dependency(%q<rspec>, ["~> 2.8.0"])
78
+ s.add_dependency(%q<bundler>, ["~> 1.1.3"])
71
79
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
72
80
  s.add_dependency(%q<simplecov>, [">= 0"])
73
81
  end
@@ -0,0 +1,74 @@
1
+ require 'rubygems'
2
+ require 'bliss'
3
+
4
+ #path = 'http://www.universobit.com.ar/AvisosUniversobit/trovit/AvisosUniversobit_1.xml'
5
+ #path = 'http://www.aestrenar.com.ar/backend/rssAestrenar.xml'
6
+ #path = 'http://procarnet.es/feed/sumavisos/sumavisos.xml'
7
+ #path = 'http://taakidom.pl/import/trovit/trovit.xml'
8
+ #path = 'http://www.deautos.com/sumavisos/feed.xml'
9
+ #path = 'http://www.autocosmos.com.ar/webservices/exchange/sumavisos.ar.xml.gz'
10
+
11
+ #path = 'http://www.workgate.co.jp/feeds/sumavisos/sumavisos.xml'
12
+ #path = 'http://www.bydgoszczak.pl/export/trovit_praca'
13
+ #path = 'http://www.espacioinmobiliario.mx/feeds/feed.xml' # times out
14
+
15
+ #path = 'http://www.indexempleos.com/peru/jobs.xml'
16
+
17
+ #path = 'http://www.bydgoszczak.pl/export/trovit_praca'
18
+
19
+ path = 'http://www.tokkoro.com/cron/adsdeck_feed.xml'
20
+
21
+ # encoding
22
+ #path = 'http://www.topdiffusion.com/flux/topdiffusion_adsdeck.xml'
23
+ #path = 'http://www.workgate.co.jp/feeds/sumavisos/sumavisos.xml' # the problem is that it comes with job, instead of ad
24
+ #path = 'http://www.ultramotors.com.br/trovit/'
25
+
26
+ #path = 'http://localhost:8080/maixon.xml'
27
+
28
+ #path = 'http://www.kasaki.com.br/somanuncios.xml' # uppercase
29
+ #path = 'http://canadajobsandcareers.ca/feed_adsdeck.php'
30
+
31
+ #path = 'http://www.goemploi.com/france/jobs.xml'
32
+ #path = 'http://www.monsieurjob.com/feeds/adsdeck.php'
33
+ #path = 'http://www.imovelajato.com.br/feeds/olx/olx.xml'
34
+
35
+ p = Bliss::ParserMachine.new(path, 'output.xml')
36
+ p.wait_tag_close('AD')
37
+ #p.max_unhandled_bytes = 20000
38
+
39
+ count = 0
40
+ p.on_root { |root|
41
+ #puts root
42
+ }
43
+ #p.on_tag_open { |depth|
44
+ # if depth.last =~ /[A-Z]/ then
45
+ # puts 'uppercase detected!'
46
+ # p.close
47
+ # end
48
+ #}
49
+ p.on_tag_open('AD') { |depth|
50
+ #puts depth.inspect
51
+ }
52
+ p.on_tag_close('AD') { |hash|
53
+ count += 1
54
+
55
+ dict = {"make"=>"name"}
56
+ only_in_dict = false
57
+ hash = hash.inject({}) { |h,v| key = dict.invert[v[0]]; key ||= v[0] unless only_in_dict; h[key] = v[1] if key; h }
58
+
59
+ #puts hash.inspect
60
+ #puts hash['type']
61
+ #puts hash.keys.inspect
62
+ #if count == 100
63
+ # p.close
64
+ #end
65
+ }
66
+
67
+ begin
68
+ p.parse
69
+ rescue Bliss::EncodingError
70
+ puts "resqued!"
71
+ end
72
+
73
+ puts "Root: #{p.root}"
74
+ puts "Count : #{count}"
@@ -0,0 +1,41 @@
1
+ require 'rubygems'
2
+ require 'eventmachine'
3
+ require 'em-http-request'
4
+
5
+ @bytes = 0
6
+ @io_read, @io_write = IO.pipe
7
+
8
+ require 'zlib'
9
+
10
+ f = File.new('test.xml', 'w')
11
+
12
+ EM.run do
13
+ http = EM::HttpRequest.new(url, :inactivity_timeout => 1).get # :head => {'accept-encoding' => "gzip, deflate"}
14
+ gzipped = false
15
+ http.headers do
16
+ puts http.response_header.inspect
17
+ if (/^attachment.+filename.+\.gz/i === http.response_header['CONTENT_DISPOSITION']) or http.response_header.compressed? or ["application/octet-stream", "application/x-gzip"].include? http.response_header['CONTENT_TYPE']
18
+ gzipped = true
19
+ end
20
+ end
21
+ http.stream { |chunk|
22
+ if @bytes > 15000
23
+ #f.close
24
+ EM.stop
25
+ else
26
+ if gzipped
27
+ @zstream ||= Zlib::Inflate.new(Zlib::MAX_WBITS+16)
28
+ chunk = @zstream.inflate(chunk)
29
+ end
30
+ puts chunk
31
+ @io_write << chunk
32
+ #f << chunk
33
+ @bytes += chunk.length
34
+ end
35
+ }
36
+ end
37
+ if @zstream
38
+ @zstream.close
39
+ end
40
+
41
+ puts @bytes
@@ -6,9 +6,7 @@ require 'em-http-request'
6
6
  @io_read, @io_write = IO.pipe
7
7
 
8
8
  EM.run do
9
- #url = 'http://www.universobit.com.ar/AvisosUniversobit/trovit/AvisosUniversobit_1.xml'
10
- #url = 'http://www.aestrenar.com.ar/backend/rssAestrenar.xml'
11
- url = 'http://procarnet.es/feed/sumavisos/sumavisos.xml'
9
+ url = ''
12
10
 
13
11
  http = EM::HttpRequest.new(url).get
14
12
  http.stream { |chunk|
@@ -17,6 +15,7 @@ EM.run do
17
15
  EM.stop
18
16
  else
19
17
  @io_write << chunk
18
+ puts chunk
20
19
  @bytes += chunk.length
21
20
  end
22
21
  }
@@ -1,9 +1,13 @@
1
1
  require 'nokogiri'
2
2
  require 'eventmachine'
3
3
  require 'em-http-request'
4
+ require 'zlib'
4
5
 
5
6
  require 'hash_extension'
6
7
 
7
- require 'bliss/sax_parser'
8
+ require 'bliss/constraint'
9
+ require 'bliss/format'
10
+
11
+ require 'bliss/encoding_error'
8
12
  require 'bliss/parser_machine'
9
13
  require 'bliss/parser'
@@ -0,0 +1,78 @@
1
+ module Bliss
2
+ class Constraint
3
+ #attr_reader :field, :type, :state
4
+ attr_reader :depth, :setting, :state
5
+
6
+ #TYPES = [:exist, :not_blank, :possible_values]
7
+
8
+ def initialize(depth, setting, params={})
9
+ @depth = depth
10
+ @setting = setting
11
+
12
+ @state = :not_checked
13
+ end
14
+
15
+ #def initialize(field, type, possible_values=nil)
16
+ # if field.is_a? Array
17
+ # @field = field
18
+ # else
19
+ # @field = [field]
20
+ # end
21
+ # @type = type
22
+ # @possible_values = possible_values
23
+ #
24
+ # @state = :not_checked
25
+ #end
26
+
27
+ def run!(hash)
28
+ @state = :not_checked
29
+ #@field.each do |field|
30
+ #if @state == :passed
31
+ # break
32
+ #end
33
+ case @setting
34
+ when :tag_name_required
35
+ if !hash.keys.include?(depth.last)
36
+ @state = :not_passed
37
+ else
38
+ @state = :passed
39
+ end
40
+ #when :not_blank
41
+ # if hash.has_key?(field) and !hash[field].to_s.empty?
42
+ # @state = :passed
43
+ # else
44
+ # @state = :not_passed
45
+ # end
46
+ #when :possible_values
47
+ # if hash.has_key?(field) and @possible_values.include?(hash[field])
48
+ # @state = :passed
49
+ # else
50
+ # @state = :not_passed
51
+ # end
52
+ end
53
+ #end
54
+ @state
55
+ end
56
+
57
+ def detail
58
+ if @state == :not_passed
59
+ detail = case @type
60
+ when :tag_name_required
61
+ [@field.join(" or "), "missing"]
62
+ #when :not_blank
63
+ # [@field.join(" or "), "blank"]
64
+ #when :possible_values
65
+ # [@field.join(" or "), "invalid"]
66
+ end
67
+ end
68
+ end
69
+
70
+ #def self.build_constraint(depth, setting, params={})#, field, type, possible_values=nil)
71
+ # constraints = []
72
+ #constraints.push Bliss::Constraint.new(field, :exist) if types.include?(:exist)
73
+ #constraints.push Bliss::Constraint.new(field, :not_blank) if types.include?(:not_blank)
74
+ #constraints.push BlissConstraint.new(field, :possible_values, possible_values) if types.include?(:possible_values)
75
+ # constraints
76
+ #end
77
+ end
78
+ end
@@ -0,0 +1,4 @@
1
+ module Bliss
2
+ class EncodingError < ::StandardError
3
+ end
4
+ end
@@ -0,0 +1,97 @@
1
+ require 'yaml'
2
+
3
+ module Bliss
4
+ class Format
5
+ @@keywords = %w{ tag_name_required content_required tag_name_type content_type tag_name_format content_format tag_name_values content_values }
6
+
7
+ def initialize
8
+ yml = YAML.load_file('/home/fernando/desarrollo/workspace/experimentos/bliss/spec.yml')
9
+ self.specifications = yml
10
+ end
11
+
12
+ # TODO for debugging only!
13
+ def keywords
14
+ @@keywords
15
+ end
16
+
17
+ def specifications=(specs={})
18
+ @specs = specs.dup
19
+ end
20
+ alias :specs= :specifications=
21
+
22
+ def constraints
23
+ return [] if not (@specs.is_a? Hash and @specs.size > 0)
24
+
25
+ constraints = []
26
+
27
+ @specs.recurse(true) do |depth, value|
28
+ if !@@keywords.include?(depth.last)
29
+ settings = @specs.value_at_chain(depth).select{|key| @@keywords.include?(key) }
30
+ end
31
+ if settings
32
+ settings.merge!({"tag_name_required" => true}) if not settings.has_key?("tag_name_required")
33
+
34
+ puts settings.inspect
35
+
36
+ # tag_name_required constraint:
37
+
38
+ settings.each_pair { |setting, value|
39
+ case setting
40
+ when "tag_name_required"
41
+ if value == true
42
+ constraints.push(Bliss::Constraint.new(depth, :tag_name_required))
43
+ end
44
+ end
45
+ }
46
+
47
+ #required_fields.each do |field|
48
+ # constraints.concat(Sumavisos::Parsers::Constraint.build_constraint(field, [:exist, :not_blank]).dup)
49
+ #end
50
+
51
+ ###
52
+
53
+ #puts "#{depth.join('/')}: #{settings.inspect}"
54
+ end
55
+ end
56
+
57
+ puts constraints.inspect
58
+
59
+ return constraints
60
+ end
61
+
62
+ # during parsing
63
+ # Sumavisos::Parsers::Validator.check_constraints(ad, constraints.select{|c| [:not_checked, :passed].include?(c.state)})
64
+
65
+ # @constraints.select{|c| c.state == :not_passed }.collect(&:detail)
66
+
67
+ def ad_constraints(root, vertical)
68
+ #required_fields = Sumavisos::Parsers::Validator::FIELDS['all']['required'].dup
69
+ #required_fields.concat(Sumavisos::Parsers::Validator::FIELDS[vertical]['required'])
70
+
71
+ #constraints = []
72
+ #required_fields.each do |field|
73
+ # constraints.concat(Sumavisos::Parsers::Constraint.build_constraint(field, [:exist, :not_blank]).dup)
74
+ #end
75
+
76
+ if vertical == 'property'
77
+ constraints.concat(Sumavisos::Parsers::Constraint.build_constraint(['type'], [:possible_values], Sumavisos::Parsers::Validator::VALID_PROPERTY_TYPES).dup)
78
+ end
79
+
80
+ constraints
81
+ end
82
+
83
+ def check_constraints(ads, constraints)
84
+ errors = []
85
+
86
+ ads = [ads] if not ads.is_a? Array
87
+
88
+ ads.each do |ad|
89
+ constraints.each do |constraint|
90
+ constraint.run!(ad)
91
+ end
92
+ end
93
+
94
+ return errors
95
+ end
96
+ end
97
+ end