bliss 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ == 0.0.9
2
+
3
+ * Features
4
+
5
+ * added support for Gzip (content type is autodetected).
6
+ * added the on_max_unhandled_bytes callback, which receives the number of bytes and a block to execute when that limit is reached.
7
+ * on_tag_close may be used without a specific tag name, and its block now handles the "current depth" array.
8
+ * Featuring changelog.
9
+
10
+ * Bugfixes
11
+
12
+ * When the XML was too small, parsing ended through the http.callback block without closing the file. That led to errors and is fixed in this version.
data/Gemfile CHANGED
@@ -10,8 +10,8 @@ gem "em-http-request", ">= 1.0.2"
10
10
  # Add dependencies to develop your gem here.
11
11
  # Include everything needed to run rake, tests, features, etc.
12
12
  group :development do
13
- gem "shoulda", ">= 0"
14
- gem "bundler", "~> 1.0.0"
13
+ gem "rspec", "~> 2.8.0"
14
+ gem "bundler", "~> 1.1.3"
15
15
  gem "jeweler", "~> 1.6.4"
16
16
  gem "simplecov"
17
17
  #gem "rcov", ">= 0"
@@ -3,6 +3,7 @@ GEM
3
3
  specs:
4
4
  addressable (2.2.7)
5
5
  cookiejar (0.3.0)
6
+ diff-lcs (1.1.3)
6
7
  em-http-request (1.0.2)
7
8
  addressable (>= 2.2.3)
8
9
  cookiejar
@@ -21,11 +22,14 @@ GEM
21
22
  multi_json (1.1.0)
22
23
  nokogiri (1.5.2)
23
24
  rake (0.9.2.2)
24
- shoulda (3.0.1)
25
- shoulda-context (~> 1.0.0)
26
- shoulda-matchers (~> 1.0.0)
27
- shoulda-context (1.0.0)
28
- shoulda-matchers (1.0.0)
25
+ rspec (2.8.0)
26
+ rspec-core (~> 2.8.0)
27
+ rspec-expectations (~> 2.8.0)
28
+ rspec-mocks (~> 2.8.0)
29
+ rspec-core (2.8.0)
30
+ rspec-expectations (2.8.0)
31
+ diff-lcs (~> 1.1.2)
32
+ rspec-mocks (2.8.0)
29
33
  simplecov (0.6.1)
30
34
  multi_json (~> 1.0)
31
35
  simplecov-html (~> 0.5.3)
@@ -35,10 +39,10 @@ PLATFORMS
35
39
  ruby
36
40
 
37
41
  DEPENDENCIES
38
- bundler (~> 1.0.0)
42
+ bundler (~> 1.1.3)
39
43
  em-http-request (>= 1.0.2)
40
44
  eventmachine (>= 1.0.0.beta.4)
41
45
  jeweler (~> 1.6.4)
42
46
  nokogiri (>= 1.5.2)
43
- shoulda
47
+ rspec (~> 2.8.0)
44
48
  simplecov
data/Rakefile CHANGED
@@ -26,6 +26,12 @@ Jeweler::Tasks.new do |gem|
26
26
  end
27
27
  Jeweler::RubygemsDotOrgTasks.new
28
28
 
29
+ require 'rspec/core'
30
+ require 'rspec/core/rake_task'
31
+ RSpec::Core::RakeTask.new(:spec) do |spec|
32
+ spec.pattern = FileList['spec/**/*_spec.rb']
33
+ end
34
+
29
35
  require 'rake/testtask'
30
36
  Rake::TestTask.new(:test) do |test|
31
37
  test.libs << 'lib' << 'test'
@@ -41,7 +47,7 @@ end
41
47
  # test.rcov_opts << '--exclude "gems/*"'
42
48
  #end
43
49
 
44
- task :default => :test
50
+ task :default => :spec
45
51
 
46
52
  require 'rake/rdoctask'
47
53
  Rake::RDocTask.new do |rdoc|
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.7
1
+ 0.0.9
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "bliss"
8
- s.version = "0.0.7"
8
+ s.version = "0.0.9"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Fernando Alonso"]
12
- s.date = "2012-03-21"
12
+ s.date = "2012-06-04"
13
13
  s.description = "streamed xml parsing tool"
14
14
  s.email = "krakatoa1987@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -18,6 +18,7 @@ Gem::Specification.new do |s|
18
18
  ]
19
19
  s.files = [
20
20
  ".document",
21
+ "CHANGELOG.rdoc",
21
22
  "Gemfile",
22
23
  "Gemfile.lock",
23
24
  "LICENSE.txt",
@@ -25,13 +26,20 @@ Gem::Specification.new do |s|
25
26
  "Rakefile",
26
27
  "VERSION",
27
28
  "bliss.gemspec",
29
+ "complete_test.rb",
30
+ "gzip_support.rb",
28
31
  "hash.rb",
29
32
  "http-machine.rb",
30
33
  "lib/bliss.rb",
34
+ "lib/bliss/constraint.rb",
35
+ "lib/bliss/encoding_error.rb",
36
+ "lib/bliss/format.rb",
31
37
  "lib/bliss/parser.rb",
32
38
  "lib/bliss/parser_machine.rb",
33
- "lib/bliss/sax_parser.rb",
34
39
  "lib/hash_extension.rb",
40
+ "spec.yml",
41
+ "spec/format_spec.rb",
42
+ "spec/spec_helper.rb",
35
43
  "test.rb",
36
44
  "test/helper.rb",
37
45
  "test/test_bliss.rb"
@@ -39,7 +47,7 @@ Gem::Specification.new do |s|
39
47
  s.homepage = "http://github.com/krakatoa/bliss"
40
48
  s.licenses = ["MIT"]
41
49
  s.require_paths = ["lib"]
42
- s.rubygems_version = "1.8.15"
50
+ s.rubygems_version = "1.8.10"
43
51
  s.summary = "streamed xml parsing tool"
44
52
 
45
53
  if s.respond_to? :specification_version then
@@ -49,16 +57,16 @@ Gem::Specification.new do |s|
49
57
  s.add_runtime_dependency(%q<nokogiri>, [">= 1.5.2"])
50
58
  s.add_runtime_dependency(%q<eventmachine>, [">= 1.0.0.beta.4"])
51
59
  s.add_runtime_dependency(%q<em-http-request>, [">= 1.0.2"])
52
- s.add_development_dependency(%q<shoulda>, [">= 0"])
53
- s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
60
+ s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
61
+ s.add_development_dependency(%q<bundler>, ["~> 1.1.3"])
54
62
  s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
55
63
  s.add_development_dependency(%q<simplecov>, [">= 0"])
56
64
  else
57
65
  s.add_dependency(%q<nokogiri>, [">= 1.5.2"])
58
66
  s.add_dependency(%q<eventmachine>, [">= 1.0.0.beta.4"])
59
67
  s.add_dependency(%q<em-http-request>, [">= 1.0.2"])
60
- s.add_dependency(%q<shoulda>, [">= 0"])
61
- s.add_dependency(%q<bundler>, ["~> 1.0.0"])
68
+ s.add_dependency(%q<rspec>, ["~> 2.8.0"])
69
+ s.add_dependency(%q<bundler>, ["~> 1.1.3"])
62
70
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
63
71
  s.add_dependency(%q<simplecov>, [">= 0"])
64
72
  end
@@ -66,8 +74,8 @@ Gem::Specification.new do |s|
66
74
  s.add_dependency(%q<nokogiri>, [">= 1.5.2"])
67
75
  s.add_dependency(%q<eventmachine>, [">= 1.0.0.beta.4"])
68
76
  s.add_dependency(%q<em-http-request>, [">= 1.0.2"])
69
- s.add_dependency(%q<shoulda>, [">= 0"])
70
- s.add_dependency(%q<bundler>, ["~> 1.0.0"])
77
+ s.add_dependency(%q<rspec>, ["~> 2.8.0"])
78
+ s.add_dependency(%q<bundler>, ["~> 1.1.3"])
71
79
  s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
72
80
  s.add_dependency(%q<simplecov>, [">= 0"])
73
81
  end
@@ -0,0 +1,74 @@
1
# Manual smoke test: streams one remote XML feed through Bliss::ParserMachine,
# counts the closed 'AD' tags and prints the detected root and the count.
require 'rubygems'
require 'bliss'

# Other feeds that were tried with this script (swap into +path+ to use one):
#   'http://www.universobit.com.ar/AvisosUniversobit/trovit/AvisosUniversobit_1.xml'
#   'http://www.aestrenar.com.ar/backend/rssAestrenar.xml'
#   'http://procarnet.es/feed/sumavisos/sumavisos.xml'
#   'http://taakidom.pl/import/trovit/trovit.xml'
#   'http://www.deautos.com/sumavisos/feed.xml'
#   'http://www.autocosmos.com.ar/webservices/exchange/sumavisos.ar.xml.gz'
#   'http://www.workgate.co.jp/feeds/sumavisos/sumavisos.xml'   # the problem is that it comes with "job" instead of "ad"
#   'http://www.bydgoszczak.pl/export/trovit_praca'
#   'http://www.espacioinmobiliario.mx/feeds/feed.xml'          # times out
#   'http://www.indexempleos.com/peru/jobs.xml'
#   'http://www.topdiffusion.com/flux/topdiffusion_adsdeck.xml' # encoding
#   'http://www.ultramotors.com.br/trovit/'                     # encoding
#   'http://localhost:8080/maixon.xml'
#   'http://www.kasaki.com.br/somanuncios.xml'                  # uppercase
#   'http://canadajobsandcareers.ca/feed_adsdeck.php'
#   'http://www.goemploi.com/france/jobs.xml'
#   'http://www.monsieurjob.com/feeds/adsdeck.php'
#   'http://www.imovelajato.com.br/feeds/olx/olx.xml'
path = 'http://www.tokkoro.com/cron/adsdeck_feed.xml'

parser = Bliss::ParserMachine.new(path, 'output.xml')
parser.wait_tag_close('AD')
# parser.max_unhandled_bytes = 20000

count = 0

# Root callback is registered but intentionally does nothing.
parser.on_root do |root|
  # puts root
end

# Example of aborting on uppercase tag names:
# parser.on_tag_open { |depth|
#   if depth.last =~ /[A-Z]/ then
#     puts 'uppercase detected!'
#     parser.close
#   end
# }

# Open callback for 'AD' is registered but intentionally does nothing.
parser.on_tag_open('AD') do |depth|
  # puts depth.inspect
end

parser.on_tag_close('AD') do |hash|
  count += 1

  dict = {"make"=>"name"}
  only_in_dict = false

  # Rename keys through the inverted dictionary; keys without a translation
  # are kept as-is unless only_in_dict is set, in which case they are dropped.
  hash = hash.each_with_object({}) do |pair, renamed|
    key = dict.invert[pair[0]]
    key ||= pair[0] unless only_in_dict
    renamed[key] = pair[1] if key
  end

  # puts hash.inspect
  # puts hash['type']
  # puts hash.keys.inspect
  # parser.close if count == 100
end

begin
  parser.parse
rescue Bliss::EncodingError
  puts "resqued!"
end

puts "Root: #{parser.root}"
puts "Count : #{count}"
@@ -0,0 +1,41 @@
1
# Manual experiment: streams a URL with em-http-request, inflating the body
# when the response headers suggest gzip content, echoes every chunk, and
# prints the number of (decoded) bytes seen. Streaming is cut off once more
# than 15000 bytes have been counted.
#
# Usage: ruby gzip_support.rb URL
require 'rubygems'
require 'eventmachine'
require 'em-http-request'
require 'zlib'

# The original script referenced an undefined +url+; take it from the command line.
url = ARGV.first
abort "usage: ruby gzip_support.rb URL" unless url

@bytes = 0
@io_read, @io_write = IO.pipe

# Kept for the commented-out "f << chunk" debugging path below.
f = File.new('test.xml', 'w')

begin
  EM.run do
    http = EM::HttpRequest.new(url, :inactivity_timeout => 1).get # :head => {'accept-encoding' => "gzip, deflate"}
    gzipped = false

    http.headers do
      header = http.response_header
      puts header.inspect

      # Treat the body as gzip when any of these header hints is present.
      attachment_gz = (/^attachment.+filename.+\.gz/i === header['CONTENT_DISPOSITION'])
      binary_type = ["application/octet-stream", "application/x-gzip"].include?(header['CONTENT_TYPE'])
      gzipped = true if attachment_gz || header.compressed? || binary_type
    end

    http.stream do |chunk|
      if @bytes > 15000
        EM.stop
      else
        if gzipped
          # MAX_WBITS + 16 makes zlib expect a gzip header instead of a raw zlib stream.
          @zstream ||= Zlib::Inflate.new(Zlib::MAX_WBITS + 16)
          chunk = @zstream.inflate(chunk)
        end
        puts chunk
        @io_write << chunk
        # f << chunk
        @bytes += chunk.length
      end
    end

    # Without these the reactor never stops for short responses or failed requests.
    http.callback { EM.stop }
    http.errback { EM.stop }
  end
ensure
  @zstream.close if @zstream
  f.close
end

puts @bytes
@@ -6,9 +6,7 @@ require 'em-http-request'
6
6
  @io_read, @io_write = IO.pipe
7
7
 
8
8
  EM.run do
9
- #url = 'http://www.universobit.com.ar/AvisosUniversobit/trovit/AvisosUniversobit_1.xml'
10
- #url = 'http://www.aestrenar.com.ar/backend/rssAestrenar.xml'
11
- url = 'http://procarnet.es/feed/sumavisos/sumavisos.xml'
9
+ url = ''
12
10
 
13
11
  http = EM::HttpRequest.new(url).get
14
12
  http.stream { |chunk|
@@ -17,6 +15,7 @@ EM.run do
17
15
  EM.stop
18
16
  else
19
17
  @io_write << chunk
18
+ puts chunk
20
19
  @bytes += chunk.length
21
20
  end
22
21
  }
@@ -1,9 +1,13 @@
1
1
  require 'nokogiri'
2
2
  require 'eventmachine'
3
3
  require 'em-http-request'
4
+ require 'zlib'
4
5
 
5
6
  require 'hash_extension'
6
7
 
7
- require 'bliss/sax_parser'
8
+ require 'bliss/constraint'
9
+ require 'bliss/format'
10
+
11
+ require 'bliss/encoding_error'
8
12
  require 'bliss/parser_machine'
9
13
  require 'bliss/parser'
@@ -0,0 +1,78 @@
1
module Bliss
  # A single validation rule bound to a tag path.
  #
  # A constraint is built from a +depth+ (array of tag names; the last element
  # is the tag being checked) and a +setting+ symbol naming the rule. The only
  # rule implemented here is +:tag_name_required+. After #run!, #state is one
  # of +:not_checked+, +:passed+ or +:not_passed+.
  #
  # TODO: the earlier field-based rules (:not_blank, :possible_values) have not
  # been ported to the depth/setting design.
  class Constraint
    attr_reader :depth, :setting, :state

    # depth   - Array of tag names leading to the checked tag.
    # setting - Symbol naming the rule (e.g. :tag_name_required).
    # params  - Hash, currently unused.
    def initialize(depth, setting, params={})
      @depth = depth
      @setting = setting
      @state = :not_checked
    end

    # Evaluates the rule against +hash+ and stores the outcome in #state.
    #
    # hash - Hash whose keys are tag names.
    #
    # Returns the new state. Settings other than :tag_name_required leave the
    # state at :not_checked.
    def run!(hash)
      @state = :not_checked

      case @setting
      when :tag_name_required
        @state = hash.key?(depth.last) ? :passed : :not_passed
      end

      @state
    end

    # Describes the failure of the last #run!.
    #
    # Returns a two-element Array [tag path joined with "/", reason] when the
    # constraint did not pass, or nil otherwise (including for settings that
    # have no description).
    def detail
      return nil unless @state == :not_passed

      # The original read @type/@field, which are never assigned; the rule and
      # its target live in @setting/@depth.
      case @setting
      when :tag_name_required
        [Array(@depth).join("/"), "missing"]
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
module Bliss
  # Bliss-specific error type: a plain StandardError subclass that adds no
  # behavior of its own.
  class EncodingError < ::StandardError; end
end
@@ -0,0 +1,97 @@
1
+ require 'yaml'
2
+
3
module Bliss
  # Holds a nested specification Hash (loaded from YAML) and turns it into
  # Bliss::Constraint objects that can be checked against parsed hashes.
  class Format
    # Keys of the specification that configure a tag rather than name a child tag.
    KEYWORDS = %w{ tag_name_required content_required tag_name_type content_type tag_name_format content_format tag_name_values content_values }.freeze

    # spec.yml at the gem root (this file is expected at lib/bliss/format.rb).
    DEFAULT_SPEC_PATH = File.expand_path('../../../spec.yml', __FILE__)

    # spec_path - path of the YAML specification file to load. Defaults to the
    #             spec.yml at the gem root instead of a path on the author's
    #             machine.
    #
    # Raises whatever YAML.load_file raises when the file is missing or invalid.
    def initialize(spec_path = DEFAULT_SPEC_PATH)
      self.specifications = YAML.load_file(spec_path)
    end

    # TODO for debugging only!
    def keywords
      KEYWORDS
    end

    # Replaces the current specification with a shallow copy of +specs+.
    def specifications=(specs={})
      @specs = specs.dup
    end
    alias :specs= :specifications=

    # Builds the constraints described by the current specification.
    #
    # Every non-keyword entry visited by Hash#recurse gets a
    # :tag_name_required constraint unless its settings carry a
    # "tag_name_required" value other than true.
    #
    # Returns an Array of Bliss::Constraint; empty when the specification is
    # not a non-empty Hash.
    def constraints
      return [] unless @specs.is_a?(Hash) && @specs.size > 0

      found = []

      @specs.recurse(true) do |depth, _value|
        unless KEYWORDS.include?(depth.last)
          settings = @specs.value_at_chain(depth).select { |key, _| KEYWORDS.include?(key) }
          settings["tag_name_required"] = true unless settings.has_key?("tag_name_required")

          settings.each_pair do |setting, setting_value|
            case setting
            when "tag_name_required"
              # depth.dup: defensive copy in case the traversal reuses its
              # depth array between yields (not verified against hash_extension).
              found.push(Bliss::Constraint.new(depth.dup, :tag_name_required)) if setting_value == true
            end
          end
        end
      end

      found
    end

    # during parsing
    # Sumavisos::Parsers::Validator.check_constraints(ad, constraints.select{|c| [:not_checked, :passed].include?(c.state)})

    # Returns the specification constraints, extended for the 'property'
    # vertical.
    #
    # NOTE(review): the 'property' branch still references Sumavisos::Parsers,
    # which is not defined anywhere in this gem; it raises NameError unless the
    # host application provides it. +root+ is currently unused.
    def ad_constraints(root, vertical)
      result = constraints

      if vertical == 'property'
        # Accumulate into +result+; the original concatenated onto a throwaway
        # array and then recomputed #constraints, losing the addition.
        result.concat(Sumavisos::Parsers::Constraint.build_constraint(['type'], [:possible_values], Sumavisos::Parsers::Validator::VALID_PROPERTY_TYPES).dup)
      end

      result
    end

    # Runs every constraint against every ad.
    #
    # ads         - a single ad Hash or an Array of them.
    # constraints - objects responding to run!(ad) and detail.
    #
    # Returns an Array with the non-nil +detail+ of each constraint run that
    # ended in :not_passed (one entry per failing ad/constraint pair).
    def check_constraints(ads, constraints)
      errors = []

      # Not Array(ads): that would explode a single Hash into key/value pairs.
      ads = [ads] unless ads.is_a?(Array)

      ads.each do |ad|
        constraints.each do |constraint|
          next unless constraint.run!(ad) == :not_passed

          detail = constraint.detail
          errors.push(detail) if detail
        end
      end

      errors
    end
  end
end