crabfarm 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/crabfarm/adapters/parser/nokogiri.rb +12 -4
- data/lib/crabfarm/adapters/parser/pdf_reader.rb +10 -2
- data/lib/crabfarm/base_parser.rb +29 -9
- data/lib/crabfarm/cli.rb +1 -1
- data/lib/crabfarm/context_factory.rb +2 -1
- data/lib/crabfarm/global_state.rb +4 -6
- data/lib/crabfarm/modes/recorder.rb +8 -2
- data/lib/crabfarm/rspec.rb +16 -7
- data/lib/crabfarm/templates/parser.rb.erb +3 -1
- data/lib/crabfarm/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fff9d93bc19a725f5fd02a5460ceafe44c3c7694
|
4
|
+
data.tar.gz: 46a109065b2018ed7a1a1bac296d0c53fba4e6d5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d756345e28af5f03a2a5bb9404035c6262ca0e192606939e8d8daa5968fca2a3c6c9faab52d3a81c4bf4bcf54579a5d7d031af12c86016b9ec3461ef17edd6f6
|
7
|
+
data.tar.gz: 79f8db6736962460721927082c49d9060aebcdfdc8b8773101427a113ee1b0e08c8065cbbd7715afc72d9e70af02adf10f6bd436e3af526ef85b3655c1a84ca3
|
data/lib/crabfarm/adapters/parser/nokogiri.rb
CHANGED
@@ -2,11 +2,19 @@ require 'nokogiri'
|
|
2
2
|
|
3
3
|
module Crabfarm
|
4
4
|
class NokogiriAdapter
|
5
|
-
def self.
|
6
|
-
|
7
|
-
|
5
|
+
def self.format
|
6
|
+
'html'
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.parse(_raw)
|
10
|
+
Nokogiri::HTML _raw
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.preprocess_parsing_target(_target)
|
14
|
+
if _target.respond_to? :to_html
|
15
|
+
_target.to_html
|
8
16
|
else
|
9
|
-
|
17
|
+
_target
|
10
18
|
end
|
11
19
|
end
|
12
20
|
end
|
data/lib/crabfarm/adapters/parser/pdf_reader.rb
CHANGED
@@ -2,8 +2,16 @@ require "pdf-reader"
|
|
2
2
|
|
3
3
|
module Crabfarm
|
4
4
|
class PdfReaderAdapter
|
5
|
-
def self.
|
6
|
-
|
5
|
+
def self.format
|
6
|
+
'pdf'
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.parse(_raw)
|
10
|
+
PDF::Reader.new StringIO.new _raw
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.preprocess_parsing_target(_target)
|
14
|
+
_target
|
7
15
|
end
|
8
16
|
end
|
9
17
|
end
|
data/lib/crabfarm/base_parser.rb
CHANGED
@@ -6,13 +6,26 @@ module Crabfarm
|
|
6
6
|
|
7
7
|
attr_reader :params, :document
|
8
8
|
|
9
|
-
def self.
|
10
|
-
@
|
9
|
+
def self.parser_engine(_engine=nil)
|
10
|
+
@engine_name = _engine
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.engine
|
14
|
+
@engine ||= Strategies.load(:parser_engine, @engine_name || Crabfarm.config.parser_engine)
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.snapshot_path(_name=nil)
|
18
|
+
_name = self.to_s.underscore if _name.nil?
|
19
|
+
File.join(GlobalState.snapshots_path, _name + '.' + engine.format)
|
20
|
+
end
|
21
|
+
|
22
|
+
def engine
|
23
|
+
self.class.engine
|
11
24
|
end
|
12
25
|
|
13
26
|
def initialize(_target, _params)
|
14
|
-
|
15
|
-
@document =
|
27
|
+
@parsed_data = engine.preprocess_parsing_target _target
|
28
|
+
@document = engine.parse @parsed_data
|
16
29
|
@params = _params
|
17
30
|
|
18
31
|
super @document
|
@@ -22,6 +35,18 @@ module Crabfarm
|
|
22
35
|
raise NotImplementedError.new
|
23
36
|
end
|
24
37
|
|
38
|
+
def take_snapshot(_name=nil)
|
39
|
+
file_path = self.class.snapshot_path _name
|
40
|
+
|
41
|
+
raise ArgumentError.new "Snapshot already exists '#{file_path}', make sure to implement the #{self.class.to_s} parse method." if File.exist? file_path
|
42
|
+
|
43
|
+
dir_path = file_path.split(File::SEPARATOR)[0...-1]
|
44
|
+
FileUtils.mkpath dir_path.join(File::SEPARATOR) if dir_path.length > 0
|
45
|
+
|
46
|
+
File.write file_path, @parsed_data
|
47
|
+
nil
|
48
|
+
end
|
49
|
+
|
25
50
|
def __getobj__
|
26
51
|
@document
|
27
52
|
end
|
@@ -30,10 +55,5 @@ module Crabfarm
|
|
30
55
|
@document = obj
|
31
56
|
end
|
32
57
|
|
33
|
-
private
|
34
|
-
|
35
|
-
def class_engine
|
36
|
-
self.class.instance_variable_get :@engine
|
37
|
-
end
|
38
58
|
end
|
39
59
|
end
|
data/lib/crabfarm/cli.rb
CHANGED
@@ -115,7 +115,7 @@ module Crabfarm
|
|
115
115
|
next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
|
116
116
|
|
117
117
|
require "crabfarm/modes/recorder"
|
118
|
-
Crabfarm::Modes::Recorder.start
|
118
|
+
Crabfarm::Modes::Recorder.start args[0], options[:playback]
|
119
119
|
end
|
120
120
|
end
|
121
121
|
|
data/lib/crabfarm/context_factory.rb
CHANGED
@@ -21,7 +21,8 @@ module Crabfarm
|
|
21
21
|
|
22
22
|
def load_crabtrap_context(_memento)
|
23
23
|
require 'crabfarm/crabtrap_context'
|
24
|
-
|
24
|
+
require 'crabfarm/modes/recorder'
|
25
|
+
m_path = Modes::Recorder.memento_path _memento
|
25
26
|
raise ResourceNotFoundError.new "Could not find memento '#{_name}'" unless File.exists? m_path
|
26
27
|
Crabfarm::CrabtrapContext.new :replay, m_path
|
27
28
|
end
|
data/lib/crabfarm/global_state.rb
CHANGED
@@ -9,14 +9,12 @@ module Crabfarm
|
|
9
9
|
CF_PATH
|
10
10
|
end
|
11
11
|
|
12
|
-
def
|
13
|
-
|
14
|
-
File.join(app_path, 'spec/mementos', _name + '.json.gz')
|
12
|
+
def mementos_path
|
13
|
+
File.join(app_path, 'spec/mementos')
|
15
14
|
end
|
16
15
|
|
17
|
-
def
|
18
|
-
|
19
|
-
File.join(app_path, 'spec/snapshots', _file_name)
|
16
|
+
def snapshots_path
|
17
|
+
File.join app_path, 'spec/snapshots'
|
20
18
|
end
|
21
19
|
|
22
20
|
extend self
|
data/lib/crabfarm/modes/recorder.rb
CHANGED
@@ -6,14 +6,20 @@ module Crabfarm
|
|
6
6
|
module Modes
|
7
7
|
module Recorder
|
8
8
|
|
9
|
+
def self.memento_path(_name)
|
10
|
+
File.join(GlobalState.mementos_path, _name + '.json.gz')
|
11
|
+
end
|
12
|
+
|
9
13
|
def self.start(_target, _replay=false)
|
10
14
|
return puts "Must provide a recording target" unless _target.is_a? String
|
11
|
-
|
15
|
+
|
16
|
+
target_path = memento_path _target
|
17
|
+
return puts "Memento file does not exist: #{target_path}" if _replay and not File.exist? target_path
|
12
18
|
|
13
19
|
crabtrap_config = Crabfarm.config.crabtrap_config
|
14
20
|
crabtrap_config[:mode] = _replay ? :replay : :capture
|
15
21
|
crabtrap_config[:port] = Utils::PortDiscovery.find_available_port
|
16
|
-
crabtrap_config[:bucket_path] =
|
22
|
+
crabtrap_config[:bucket_path] = target_path
|
17
23
|
|
18
24
|
crabtrap = CrabtrapRunner.new crabtrap_config
|
19
25
|
crabtrap.start
|
data/lib/crabfarm/rspec.rb
CHANGED
@@ -3,12 +3,21 @@ module Crabfarm
|
|
3
3
|
|
4
4
|
class Error < Crabfarm::Error; end
|
5
5
|
|
6
|
-
def parse(_snapshot, _options={})
|
7
|
-
|
6
|
+
def parse(_snapshot=nil, _options={})
|
7
|
+
|
8
|
+
raise Error.new "Crawl is only available in parser specs" unless described_class < Crabfarm::BaseParser
|
9
|
+
|
10
|
+
if _snapshot.is_a? Hash
|
11
|
+
raise ArgumentException.new 'Invalid arguments' unless _options.nil?
|
12
|
+
_options = _snapshot
|
13
|
+
_snapshot = nil
|
14
|
+
end
|
15
|
+
|
16
|
+
snapshot_path = described_class.snapshot_path _snapshot
|
8
17
|
raise Error.new "Snapshot does not exist #{_snapshot}" unless File.exist? snapshot_path
|
9
18
|
|
10
|
-
|
11
|
-
parser = described_class.new
|
19
|
+
data = File.read snapshot_path
|
20
|
+
parser = described_class.new data, _options
|
12
21
|
parser.parse
|
13
22
|
parser
|
14
23
|
end
|
@@ -39,7 +48,7 @@ module Crabfarm
|
|
39
48
|
end
|
40
49
|
|
41
50
|
def parser
|
42
|
-
@parser
|
51
|
+
@parser ||= parse
|
43
52
|
end
|
44
53
|
|
45
54
|
def driver(_session_id=nil)
|
@@ -54,8 +63,8 @@ RSpec.configure do |config|
|
|
54
63
|
|
55
64
|
config.around(:example) do |example|
|
56
65
|
if described_class < Crabfarm::BaseParser
|
57
|
-
if example.metadata[:parsing]
|
58
|
-
@parser = parse example.metadata[:parsing], example.metadata[:
|
66
|
+
if example.metadata[:parsing] || example[:parsing_with_params]
|
67
|
+
@parser = parse example.metadata[:parsing], example.metadata[:parsing_with_params] || {}
|
59
68
|
end
|
60
69
|
example.run
|
61
70
|
elsif described_class < Crabfarm::BaseState
|
data/lib/crabfarm/templates/parser.rb.erb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
class <%= parser_class %> < Crabfarm::BaseParser
|
2
2
|
|
3
3
|
def parse
|
4
|
-
#
|
4
|
+
# You can replace the following line after running the owner state specs once.
|
5
|
+
# Take a look at the 'Testing' section of the README.md for more information!
|
6
|
+
take_snapshot
|
5
7
|
end
|
6
8
|
|
7
9
|
end
|
data/lib/crabfarm/version.rb
CHANGED