crabfarm 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: effa8e1c5802af234c1fa733db6c2e1adea6ee5b
4
- data.tar.gz: b988a4ddc01841d41cafd4225942820fd8a10108
3
+ metadata.gz: fff9d93bc19a725f5fd02a5460ceafe44c3c7694
4
+ data.tar.gz: 46a109065b2018ed7a1a1bac296d0c53fba4e6d5
5
5
  SHA512:
6
- metadata.gz: 43b9f2b0a3157ca68addadc895fc77de7f0c025205095b4ecc3027630bb71a8994dc84c5b0de32f1a135f2a10c5fcfc4e9cf1220307f95e0a008ad77e4c01383
7
- data.tar.gz: 01863c91b81858759875f6d12890c9c3c2a1a846618df9dbf81c2a3632f5174018adcc2a2b13cc1bd4d10d8b0f275a0ffed900e0b970409ca5d690b6a548acb8
6
+ metadata.gz: d756345e28af5f03a2a5bb9404035c6262ca0e192606939e8d8daa5968fca2a3c6c9faab52d3a81c4bf4bcf54579a5d7d031af12c86016b9ec3461ef17edd6f6
7
+ data.tar.gz: 79f8db6736962460721927082c49d9060aebcdfdc8b8773101427a113ee1b0e08c8065cbbd7715afc72d9e70af02adf10f6bd436e3af526ef85b3655c1a84ca3
@@ -2,11 +2,19 @@ require 'nokogiri'
2
2
 
3
3
  module Crabfarm
4
4
  class NokogiriAdapter
5
- def self.parse(_element)
6
- if _element.respond_to? :to_html
7
- Nokogiri::HTML _element.to_html
5
+ def self.format
6
+ 'html'
7
+ end
8
+
9
+ def self.parse(_raw)
10
+ Nokogiri::HTML _raw
11
+ end
12
+
13
+ def self.preprocess_parsing_target(_target)
14
+ if _target.respond_to? :to_html
15
+ _target.to_html
8
16
  else
9
- Nokogiri::HTML _element
17
+ _target
10
18
  end
11
19
  end
12
20
  end
@@ -2,8 +2,16 @@ require "pdf-reader"
2
2
 
3
3
  module Crabfarm
4
4
  class PdfReaderAdapter
5
- def self.parse(_pdf_data)
6
- PDF::Reader.new StringIO.new _pdf_data
5
+ def self.format
6
+ 'pdf'
7
+ end
8
+
9
+ def self.parse(_raw)
10
+ PDF::Reader.new StringIO.new _raw
11
+ end
12
+
13
+ def self.preprocess_parsing_target(_target)
14
+ _target
7
15
  end
8
16
  end
9
17
  end
@@ -6,13 +6,26 @@ module Crabfarm
6
6
 
7
7
  attr_reader :params, :document
8
8
 
9
- def self.engine(_engine)
10
- @engine = _engine
9
+ def self.parser_engine(_engine=nil)
10
+ @engine_name = _engine
11
+ end
12
+
13
+ def self.engine
14
+ @engine ||= Strategies.load(:parser_engine, @engine_name || Crabfarm.config.parser_engine)
15
+ end
16
+
17
+ def self.snapshot_path(_name=nil)
18
+ _name = self.to_s.underscore if _name.nil?
19
+ File.join(GlobalState.snapshots_path, _name + '.' + engine.format)
20
+ end
21
+
22
+ def engine
23
+ self.class.engine
11
24
  end
12
25
 
13
26
  def initialize(_target, _params)
14
- engine_class = Strategies.load(:parser_engine, class_engine || Crabfarm.config.parser_engine)
15
- @document = engine_class.parse _target
27
+ @parsed_data = engine.preprocess_parsing_target _target
28
+ @document = engine.parse @parsed_data
16
29
  @params = _params
17
30
 
18
31
  super @document
@@ -22,6 +35,18 @@ module Crabfarm
22
35
  raise NotImplementedError.new
23
36
  end
24
37
 
38
+ def take_snapshot(_name=nil)
39
+ file_path = self.class.snapshot_path _name
40
+
41
+ raise ArgumentError.new "Snapshot already exists '#{file_path}', make sure to implement the #{self.class.to_s} parse method." if File.exist? file_path
42
+
43
+ dir_path = file_path.split(File::SEPARATOR)[0...-1]
44
+ FileUtils.mkpath dir_path.join(File::SEPARATOR) if dir_path.length > 0
45
+
46
+ File.write file_path, @parsed_data
47
+ nil
48
+ end
49
+
25
50
  def __getobj__
26
51
  @document
27
52
  end
@@ -30,10 +55,5 @@ module Crabfarm
30
55
  @document = obj
31
56
  end
32
57
 
33
- private
34
-
35
- def class_engine
36
- self.class.instance_variable_get :@engine
37
- end
38
58
  end
39
59
  end
@@ -115,7 +115,7 @@ module Crabfarm
115
115
  next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
116
116
 
117
117
  require "crabfarm/modes/recorder"
118
- Crabfarm::Modes::Recorder.start GlobalState.memento_path(args[0]), options[:playback]
118
+ Crabfarm::Modes::Recorder.start args[0], options[:playback]
119
119
  end
120
120
  end
121
121
 
@@ -21,7 +21,8 @@ module Crabfarm
21
21
 
22
22
  def load_crabtrap_context(_memento)
23
23
  require 'crabfarm/crabtrap_context'
24
- m_path = GlobalState.memento_path _memento
24
+ require 'crabfarm/modes/recorder'
25
+ m_path = Modes::Recorder.memento_path _memento
25
26
  raise ResourceNotFoundError.new "Could not find memento '#{_name}'" unless File.exists? m_path
26
27
  Crabfarm::CrabtrapContext.new :replay, m_path
27
28
  end
@@ -9,14 +9,12 @@ module Crabfarm
9
9
  CF_PATH
10
10
  end
11
11
 
12
- def memento_path(_name)
13
- return nil if _name.nil?
14
- File.join(app_path, 'spec/mementos', _name + '.json.gz')
12
+ def mementos_path
13
+ File.join(app_path, 'spec/mementos')
15
14
  end
16
15
 
17
- def snapshot_path(_file_name)
18
- return nil if _file_name.nil?
19
- File.join(app_path, 'spec/snapshots', _file_name)
16
+ def snapshots_path
17
+ File.join app_path, 'spec/snapshots'
20
18
  end
21
19
 
22
20
  extend self
@@ -6,14 +6,20 @@ module Crabfarm
6
6
  module Modes
7
7
  module Recorder
8
8
 
9
+ def self.memento_path(_name)
10
+ File.join(GlobalState.mementos_path, _name + '.json.gz')
11
+ end
12
+
9
13
  def self.start(_target, _replay=false)
10
14
  return puts "Must provide a recording target" unless _target.is_a? String
11
- return puts "Memento file does not exist: #{_target}" if _replay and not File.exist? _target
15
+
16
+ target_path = memento_path _target
17
+ return puts "Memento file does not exist: #{target_path}" if _replay and not File.exist? target_path
12
18
 
13
19
  crabtrap_config = Crabfarm.config.crabtrap_config
14
20
  crabtrap_config[:mode] = _replay ? :replay : :capture
15
21
  crabtrap_config[:port] = Utils::PortDiscovery.find_available_port
16
- crabtrap_config[:bucket_path] = _target
22
+ crabtrap_config[:bucket_path] = target_path
17
23
 
18
24
  crabtrap = CrabtrapRunner.new crabtrap_config
19
25
  crabtrap.start
@@ -3,12 +3,21 @@ module Crabfarm
3
3
 
4
4
  class Error < Crabfarm::Error; end
5
5
 
6
- def parse(_snapshot, _options={})
7
- snapshot_path = GlobalState.snapshot_path _snapshot
6
+ def parse(_snapshot=nil, _options={})
7
+
8
+ raise Error.new "Crawl is only available in parser specs" unless described_class < Crabfarm::BaseParser
9
+
10
+ if _snapshot.is_a? Hash
11
+ raise ArgumentException.new 'Invalid arguments' unless _options.nil?
12
+ _options = _snapshot
13
+ _snapshot = nil
14
+ end
15
+
16
+ snapshot_path = described_class.snapshot_path _snapshot
8
17
  raise Error.new "Snapshot does not exist #{_snapshot}" unless File.exist? snapshot_path
9
18
 
10
- html = File.read snapshot_path
11
- parser = described_class.new html, _options
19
+ data = File.read snapshot_path
20
+ parser = described_class.new data, _options
12
21
  parser.parse
13
22
  parser
14
23
  end
@@ -39,7 +48,7 @@ module Crabfarm
39
48
  end
40
49
 
41
50
  def parser
42
- @parser
51
+ @parser ||= parse
43
52
  end
44
53
 
45
54
  def driver(_session_id=nil)
@@ -54,8 +63,8 @@ RSpec.configure do |config|
54
63
 
55
64
  config.around(:example) do |example|
56
65
  if described_class < Crabfarm::BaseParser
57
- if example.metadata[:parsing]
58
- @parser = parse example.metadata[:parsing], example.metadata[:using] || {}
66
+ if example.metadata[:parsing] || example[:parsing_with_params]
67
+ @parser = parse example.metadata[:parsing], example.metadata[:parsing_with_params] || {}
59
68
  end
60
69
  example.run
61
70
  elsif described_class < Crabfarm::BaseState
@@ -1,7 +1,9 @@
1
1
  class <%= parser_class %> < Crabfarm::BaseParser
2
2
 
3
3
  def parse
4
- # Do some damage!
4
+ # You can replace the following line after running the owner state specs once.
5
+ # Take a look at the 'Testing' section of the README.md for more information!
6
+ take_snapshot
5
7
  end
6
8
 
7
9
  end
@@ -1,3 +1,3 @@
1
1
  module Crabfarm
2
- VERSION = "0.2.3"
2
+ VERSION = "0.2.4"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crabfarm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ignacio Baixas