sinew 1.0.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,126 @@
1
+ require 'csv'
2
+ require 'stringex'
3
+
4
+ #
5
+ # CSV output.
6
+ #
7
+
8
+ module Sinew
9
+ class Output
10
+ attr_reader :sinew, :columns, :rows, :csv
11
+
12
# Build an output bound to the given sinew. Emitted rows are collected in
# memory; nothing is opened or written here.
#
# sinew - the owning sinew object (read later for options and logging).
def initialize(sinew)
  @rows = []
  @sinew = sinew
end
16
+
17
# Path of the CSV file to write, derived from the recipe path in
# sinew.options[:recipe]: the recipe's extension is swapped for .csv, or
# .csv is appended when the recipe has no extension. The result is memoized.
#
# Returns a String.
def filename
  @filename ||= begin
    recipe = sinew.options[:recipe]
    ext = File.extname(recipe)
    if ext.empty?
      "#{recipe}.csv"
    else
      # Only the trailing extension is replaced. A gsub on the extension text
      # would also rewrite earlier occurrences, e.g. in a directory name
      # ("recipes.sinew/foo.sinew") or a doubled suffix ("a.rb.rb").
      "#{recipe.chomp(ext)}.csv"
    end
  end
end
28
+
29
# Announce the output file (unless sinew is quiet), remember the column
# list, then open the CSV file and write the header row.
#
# columns - column names; nested arrays are flattened first.
#
# The open CSV handle is kept in @csv for later rows; it is not closed here.
def header(columns)
  sinew.banner("Writing to #{filename}...") unless sinew.quiet?

  @columns = columns.flatten

  # open csv, write header row
  @csv = CSV.open(filename, 'wb')
  @csv << @columns
end
39
+
40
# Record one row and append it to the CSV.
#
# row - a Hash of column => value. On the first call, when no CSV is open
#       yet, its keys become the header.
#
# A copy of the raw row is kept in rows. The values written to the CSV are
# the normalized values for the known columns, in column order; keys that
# are not in columns are not written.
def emit(row)
  # implicit header if necessary
  header(row.keys) unless csv

  rows << row.dup

  # map columns to row, and normalize along the way
  values = columns.map { |column| normalize(row[column]) }

  # print only the columns that actually have a value
  printable = columns.zip(values).select { |_column, value| value.present? }.to_h
  sinew.vputs printable.ai

  csv << values
  # flush after every row
  csv.flush
end
60
+
61
# Number of rows emitted so far.
def count
  rows.size
end
64
+
65
# Print a summary to $stderr: for each column, how many of the emitted rows
# had a present value, out of the total, with a percentage. Columns are
# listed most-filled first, ties broken by column name.
#
# Does nothing when no rows were emitted.
def report
  total = count
  return if total.zero?

  sinew.banner("Got #{total} rows.")

  # calculate counts
  filled = rows.each_with_object(Hash.new(0)) do |row, tally|
    row.each_pair { |key, value| tally[key] += 1 if value.present? }
  end

  # sort by counts
  ordered = columns.sort_by { |column| [ -filled[column], column ] }

  # report, with the name column padded to the longest name
  width = ordered.map { |column| column.to_s.length }.max + 1
  fmt = " %-#{width}s %7d / %-7d %6.1f%%\n"
  ordered.each do |column|
    hits = filled[column]
    $stderr.printf(fmt, column, hits, total, hits * 100.0 / total)
  end
end
85
+
86
# Turn a cell value into a clean single-line string for the CSV.
#
# s - a Nokogiri element or node set (its inner HTML is used), an Array
#     (elements are stringified and joined with '|'), or anything else
#     (to_s).
#
# Returns the cleaned String.
def normalize(s)
  # noko/array/misc => string
  text =
    case s
    when Nokogiri::XML::Element, Nokogiri::XML::NodeSet then s.inner_html
    when Array then s.map(&:to_s).join('|')
    else s.to_s
    end

  #
  # Below uses stringex
  #
  # github.com/rsl/stringex/blob/master/lib/stringex/string_extensions.rb
  # github.com/rsl/stringex/blob/master/lib/stringex/localization/conversion_expressions.rb
  #
  # Steps, in order:
  #   strip_html_tags                      <a>b</a> => b
  #   convert_smart_punctuation            MS Word 'smart punctuation' => ASCII
  #   convert_accented_html_entities       "&aacute;" => "a"
  #   convert_miscellaneous_html_entities  &amp, &frac, etc.
  #   to_ascii                             unicode => regular characters
  #   squish
  #
  text
    .strip_html_tags
    .convert_smart_punctuation
    .convert_accented_html_entities
    .convert_miscellaneous_html_entities
    .to_ascii
    .squish
end
124
+ protected :normalize
125
+ end
126
+ end
@@ -0,0 +1,148 @@
1
+ require 'digest/md5'
2
+ require 'httparty'
3
+ require 'htmlentities'
4
+
5
+ #
6
+ # Process a single HTTP request. Mostly a wrapper around HTTParty.
7
+ #
8
+
9
+ module Sinew
10
+ class Error < StandardError; end
11
+
12
+ class Request
13
+ HTML_ENTITIES = HTMLEntities.new
14
+ VALID_METHODS = %w[get post patch put delete head options].freeze
15
+
16
+ attr_reader :sinew, :method, :uri, :options, :cache_key
17
+
18
+ # Options are largely compatible with HTTParty, except for :method.
19
# sinew   - the owning sinew object.
# method  - HTTP method name, checked against VALID_METHODS by validate!.
# url     - the url to request; cleaned up and parsed by parse_url.
# options - request options; copied, so the caller's hash is not modified
#           when :query is later removed from the copy.
def initialize(sinew, method, url, options = {})
  @sinew, @method = sinew, method
  @options = options.dup

  # order matters: parse_url reads @options, and the cache key reads @uri
  @uri = parse_url(url)
  @cache_key = calculate_cache_key
end
26
+
27
+ # run the request, return the result
28
# Validates first, then merges the global headers from
# sinew.runtime_options with any per-request headers (per-request values
# win) before handing the request to HTTParty.
#
# Returns the Response built from the HTTParty response.
# Raises whatever validate! raises for an invalid request.
def perform
  validate!

  # merge global/options headers
  merged = sinew.runtime_options.headers
  extra = options[:headers]
  merged = merged.merge(extra) if extra
  options[:headers] = merged

  Response.from_network(self, HTTParty.send(method, uri, options))
end
39
+
40
+ # We accept sloppy urls and attempt to clean them up
41
# url - the (possibly sloppy) url String.
#
# Removes :query from options as a side effect; a present query is appended
# to the url itself.
#
# Returns a URI.
def parse_url(url)
  # remove entities, then fix a couple of common encoding bugs
  cleaned = HTML_ENTITIES.decode(url).gsub(' ', '%20').gsub("'", '%27')

  # append query manually (instead of letting HTTParty handle it) so we can
  # include it in cache_key
  query = options.delete(:query)
  return URI.parse(cleaned) unless query.present?

  params = HTTParty::HashConversions.to_params(query)
  joiner = cleaned.include?('?') ? '&' : '?'
  URI.parse("#{cleaned}#{joiner}#{params}")
end
62
+ protected :parse_url
63
+
64
# Build the cache key for this request: "<host dir>/<path>", where the path
# is made from the method (omitted for gets), uri path, query and body.
#
# The parts are first passed, as a Hash, through the
# before_generate_cache_key hook from sinew.runtime_options, and the hook's
# return value is what gets used.
#
# Returns a String.
def calculate_cache_key
  dir = pathify(uri.host)

  payload = body
  body_key = payload.is_a?(Hash) ? HTTParty::HashConversions.to_params(payload) : payload&.dup

  # build key, as a hash for before_generate_cache_key
  parts = sinew.runtime_options.before_generate_cache_key.call(
    {
      method: method.dup,
      path: uri.path,
      query: uri.query,
      body: body_key,
    }
  )

  # strip method for gets
  parts.delete(:method) if parts[:method] == 'get'

  # pull out the values, join and pathify
  path = pathify(parts.values.select(&:present?).join(','))

  # shorten long paths
  path = Digest::MD5.hexdigest(path) if path.length > 250

  "#{dir}/#{path}"
end
96
+ protected :calculate_cache_key
97
+
98
+ def validate!
99
+ raise "invalid method #{method}" if !VALID_METHODS.include?(method)
100
+ raise "invalid url #{uri}" if uri.scheme !~ /^http/
101
+ raise "can't get with a body" if method == 'get' && body
102
+ raise "Content-Type doesn't make sense without a body" if content_type && !body
103
+ end
104
+ protected :validate!
105
+
106
# The :body entry of the request options (nil when none was given).
def body
  options[:body]
end
109
+ protected :body
110
+
111
# The :headers entry of the request options (nil when none was given).
def headers
  options[:headers]
end
114
+ protected :headers
115
+
116
# The 'Content-Type' request header, looked up with exactly that spelling.
# Returns nil when there are no headers or the header is not set.
def content_type
  current = headers
  current && current['Content-Type']
end
119
+ protected :content_type
120
+
121
# True when the request's Content-Type is exactly
# application/x-www-form-urlencoded.
def form?
  content_type == 'application/x-www-form-urlencoded'
end
124
+ protected :form?
125
+
126
# Turn an arbitrary string (a host, or the joined cache-key parts) into a
# lowercase fragment made only of [a-z0-9_.,=-] plus %xx escapes.
#
# s - the String to convert.
#
# Returns a new String; '_root_' stands in for an otherwise blank result.
def pathify(s)
  # remove leading slash (^ is line-anchored, so this also applies to a
  # slash at the start of any later line)
  s = s.gsub(/^\//, '')
  # .. => comma
  s = s.gsub('..', ',')
  # query separators => comma
  s = s.gsub(/[?\/&]/, ',')
  # ,, => comma
  s = s.gsub(',,', ',')
  # encode invalid path chars, one %xx per byte. Encoding only the first
  # byte (unpack('H2')) made different multibyte characters that share a
  # lead byte produce the same key.
  s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |char|
    char.bytes.map { |byte| format('%%%02x', byte) }.join
  end
  # handle empty case
  s = '_root_' if s.blank?
  # always downcase
  s.downcase
end
146
+ protected :pathify
147
+ end
148
+ end
@@ -0,0 +1,75 @@
1
+ #
2
+ # An HTTP response. Mostly a wrapper around HTTParty.
3
+ #
4
+
5
+ module Sinew
6
+ class Response
7
+ attr_accessor :request, :uri, :body, :code, :headers
8
+
9
# Build a Response from a live HTTParty response.
#
# request        - the Request that was performed.
# party_response - the HTTParty response; its final uri, code, headers and
#                  body are copied over.
#
# The body is always stored as a valid UTF-8 String:
#   * a nil body becomes '' instead of raising on nil.encoding
#     (NOTE(review): presumably nil can occur for HEAD requests, which
#     VALID_METHODS allows — confirm against HTTParty)
#   * other encodings are transcoded, replacing bad bytes with '?'
#   * bodies already tagged UTF-8 but containing invalid bytes are scrubbed
#     the same way, so later string operations don't fail on them
def self.from_network(request, party_response)
  Response.new.tap do |response|
    response.request = request
    response.uri = party_response.request.last_uri
    response.code = party_response.code
    response.headers = party_response.headers.to_h

    # force to utf-8 as best we can
    body = party_response.body || ''
    if body.encoding != Encoding::UTF_8
      body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
    elsif !body.valid_encoding?
      body = body.scrub('?')
    end
    response.body = body
  end
end
24
+
25
# Build a Response from cached data.
#
# request - the Request this response belongs to.
# body    - the cached body, stored as-is.
# head    - JSON text holding uri, code and headers (as produced from
#           head_as_json), or nil.
#
# Without head, the response defaults to the request's uri, code 200 and
# empty headers. With head, uri/code/headers come from the parsed JSON; it
# is parsed with symbolized names, so header keys come back as symbols.
def self.from_cache(request, body, head)
  meta = head ? JSON.parse(head, symbolize_names: true) : nil

  response = Response.new
  response.request = request
  response.body = body

  if meta
    # cached response headers
    response.uri = URI.parse(meta[:uri])
    response.code = meta[:code]
    response.headers = meta[:headers]
  else
    # defaults
    response.uri = request.uri
    response.code = 200
    response.headers = {}
  end

  response
end
44
+
45
# Build a placeholder Response for a request that timed out: the request's
# own uri, body 'timeout', code 999 and no headers.
def self.from_timeout(request)
  response = Response.new
  response.request = request
  response.uri = request.uri
  response.body = 'timeout'
  response.code = 999
  response.headers = {}
  response
end
54
+
55
# True for any status code of 400 or above (this includes the 999 used for
# timeouts).
def error?
  code >= 400
end
58
+
59
# True for status codes of 500 and above (this includes the 999 used for
# timeouts).
def error_500?
  code >= 500
end
62
+
63
# True when the response's uri differs from the uri that was requested.
def redirected?
  request.uri != uri
end
66
+
67
# The response metadata (everything except the body) as a Hash with the
# keys :uri, :code and :headers.
def head_as_json
  { uri: uri, code: code, headers: headers }
end
74
+ end
75
+ end
@@ -0,0 +1,26 @@
1
+ #
2
+ # Runtime options that sinew files can modify.
3
+ #
4
+
5
+ module Sinew
6
+ class RuntimeOptions
7
+ attr_accessor :retries
8
+ attr_accessor :rate_limit
9
+ attr_accessor :headers
10
+ attr_accessor :before_generate_cache_key
11
+
12
# Set the defaults: 3 retries, a rate limit of 1, a sinew User-Agent header
# and a cache-key hook that returns its argument unchanged.
#
# When the SINEW_TEST environment variable is set, the rate limit is 0.
def initialize
  self.retries = 3
  # for testing, SINEW_TEST switches the rate limit off
  self.rate_limit = ENV['SINEW_TEST'] ? 0 : 1
  self.headers = { 'User-Agent' => "sinew/#{VERSION}" }
  self.before_generate_cache_key = ->(key) { key }
end
25
+ end
26
+ end
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = "1.0.4"
3
+ VERSION = '2.0.0'.freeze
4
4
  end
@@ -1,7 +1,9 @@
1
- get "http://httpbin.org"
2
- noko.css("ul li a").each do |a|
3
- row = { }
1
+ get 'http://httpbin.org'
2
+ noko.css('ul li a').each do |a|
3
+ row = {}
4
4
  row[:url] = a[:href]
5
5
  row[:title] = a.text
6
6
  csv_emit(row)
7
7
  end
8
+
9
+ get 'http://httpbin.org/redirect/2'
@@ -1,29 +1,34 @@
1
- $LOAD_PATH << File.expand_path("../lib", __FILE__)
1
+ $LOAD_PATH.unshift("#{__dir__}/lib")
2
2
 
3
- require "sinew/version"
3
+ require 'sinew/version'
4
4
 
5
5
  Gem::Specification.new do |s|
6
- s.name = "sinew"
6
+ s.name = 'sinew'
7
7
  s.version = Sinew::VERSION
8
8
  s.platform = Gem::Platform::RUBY
9
- s.authors = ["Adam Doppelt"]
10
- s.email = ["amd@gurge.com"]
11
- s.homepage = "http://github.com/gurgeous/sinew"
12
- s.summary = "Sinew - structured web crawling using recipes."
13
- s.description = "Crawl web sites easily using ruby recipes, with caching and nokogiri."
9
+ s.license = 'MIT'
10
+ s.authors = [ 'Adam Doppelt' ]
11
+ s.email = [ 'amd@gurge.com' ]
12
+ s.homepage = 'http://github.com/gurgeous/sinew'
13
+ s.summary = 'Sinew - structured web crawling using recipes.'
14
+ s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
15
+ s.required_ruby_version = '~> 2.3'
14
16
 
15
- s.rubyforge_project = "sinew"
17
+ s.rubyforge_project = 'sinew'
16
18
 
17
- s.add_runtime_dependency "activesupport", "~> 3.0"
18
- s.add_runtime_dependency "awesome_print"
19
- s.add_runtime_dependency "htmlentities"
20
- s.add_runtime_dependency "nokogiri"
21
- s.add_runtime_dependency "stringex", "~> 2.0"
22
- s.add_runtime_dependency "trollop"
23
- s.add_development_dependency "rake"
19
+ s.add_runtime_dependency 'awesome_print', '~> 1.8'
20
+ s.add_runtime_dependency 'htmlentities', '~> 4.3'
21
+ s.add_runtime_dependency 'httparty', '~> 0.16'
22
+ s.add_runtime_dependency 'nokogiri', '~> 1.8'
23
+ s.add_runtime_dependency 'scripto', '~> 0'
24
+ s.add_runtime_dependency 'slop', '~> 4.6'
25
+ s.add_runtime_dependency 'stringex', '~> 2.8'
26
+ s.add_development_dependency 'minitest', '~> 5.11'
27
+ s.add_development_dependency 'rake', '~> 12.3'
28
+ s.add_development_dependency 'webmock', '~> 3.4'
24
29
 
25
30
  s.files = `git ls-files`.split("\n")
26
- s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
27
- s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
28
- s.require_paths = ["lib"]
31
+ s.test_files = `git ls-files -- test/*`.split("\n")
32
+ s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
33
+ s.require_paths = [ 'lib' ]
29
34
  end