news_scraper 0.1.2 → 1.0.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: e571db5d64bc7679b4b061981208026da6878101
- data.tar.gz: 4009ca8b1571847afc1756eb5678e634feb0ba54
+ metadata.gz: db7d631f3f6cf73ff2e57b9e472804651b9fe1e0
+ data.tar.gz: 1045878eb97749d6b264a486ac34bfb89f4796dd
  SHA512:
- metadata.gz: fcb8a793e259163dec3bcdf260bdd4accb3275f11f1ae5df05ce0b40ff3708bf9113be3c8021846ed4f0497e24f5bddffc4165408fd3ef4e57c881f5a6d8d501
- data.tar.gz: be27fea65bc420052cdae19f7ae38135565fed7ff8730e5bdf4445058af34a30bbfed006c5cb5f901b6688020d892c70a322adaa5d6d4c5270521746e12d6fa6
+ metadata.gz: a53423be5dbda33ead7dbb46bc494e40fcf412172a291496128b40512c985fc157c646481e1b8a183be2f709486e2cc7ec27d47a17bb9715c3b19dbe09dd7e42
+ data.tar.gz: eb43f129a0ca1a9f6eb02f24bfeb891583c96b0cce548afe43cf613231e8908a66e381347aeb16f5be9ea4af3ca96f82dcaf4da005951fb5ae5936d3f3530cbc
data/.gitignore CHANGED
@@ -8,3 +8,4 @@
  /spec/reports/
  /tmp/
  *.gem
+ coverage
data/.rubocop.yml CHANGED
@@ -1,5 +1,5 @@
  AllCops:
- TargetRubyVersion: 2.3
+ TargetRubyVersion: 2.2

  ClassLength:
  Max: 500
data/README.md CHANGED
@@ -50,7 +50,7 @@ How the `Scraper` extracts and parses for the information is determined by scrap

  Calling `NewsScraper::Scraper#scrape` with either the array or block notation will yield `transformed_data` hashes. [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml) defines the data types that will be scraped for.

- In addition, the `uri` and `root_domain`(hostname) of the article will be returned in the hash too.
+ In addition, the `url` and `root_domain`(hostname) of the article will be returned in the hash too.

  Example
  ```
@@ -62,7 +62,7 @@ Example
  section: 'technology',
  datetime: '1991-10-05T12:00:00+00:00',
  title: 'Linus Linux',
- uri: 'linusworld.com/the-linux-kernel',
+ url: 'https://linusworld.com/the-linux-kernel',
  root_domain: 'linusworld.com'
  }
  ```
data/config/article_scrape_patterns.yml CHANGED
@@ -52,10 +52,16 @@ presets:
  article_tag: &article_tag_keywords
  method: "xpath"
  pattern: "//meta[@property='article:tag']/@content"
+ news_keywords: &news_keywords_keywords
+ method: "xpath"
+ pattern: "//meta[@name='news_keywords']/@content"
  section:
  meta: &meta_section
  method: "xpath"
  pattern: "//meta[@property='article:section']/@content"
+ section: &section_section
+ method: "xpath"
+ pattern: "//meta[@name='section']/@content"
  datetime:
  article_date_original: &article_date_original_datetime
  method: xpath
@@ -87,6 +93,15 @@ presets:
  sailthru_date: &sailthru_date_datetime
  method: xpath
  pattern: //meta[@name='sailthru.date']/@content
+ time: &time_datetime
+ method: xpath
+ pattern: //time/@datetime
+ date_published_datetime: &date_published_datetime_datetime
+ method: xpath
+ pattern: //meta[@itemprop="datePublished"]/@datetime
+ date_published_content: &date_published_content_datetime
+ method: xpath
+ pattern: //meta[@itemprop="datePublished"]/@content
  title:
  html: &html_title
  method: "xpath"
data/lib/news_scraper/cli.rb CHANGED
@@ -6,14 +6,13 @@ module NewsScraper

  DEFAULT_COLOR = "\x1b[36m".freeze

- def log(message, color: DEFAULT_COLOR, new_line: false)
- message += "\n" if new_line
+ def log(message, color: DEFAULT_COLOR)
  $stdout.puts "#{color}┃\x1b[0m " + message
  end

- def log_lines(message, color: DEFAULT_COLOR, new_line: false)
+ def log_lines(message, color: DEFAULT_COLOR)
  message.split("\n").each do |line|
- log(line, color: color, new_line: new_line)
+ log(line, color: color)
  end
  end

@@ -49,8 +48,8 @@ module NewsScraper
  buf = -1
  available = (1..options.length).to_a
  until available.include?(buf.to_i)
- begin
- buf = Readline.readline("\x1b[34m┃ > \x1b[33m", true)
+ buf = begin
+ Readline.readline("\x1b[34m┃ > \x1b[33m", true)
  rescue Interrupt
  nil
  end
@@ -71,14 +70,16 @@ module NewsScraper

  ## Fancy Headers and Footers

- def put_header(text = "", color = DEFAULT_COLOR)
+ def put_header(text = "", color: DEFAULT_COLOR)
  put_edge(color, "┏━━ ", text)
  end

- def put_footer(color = DEFAULT_COLOR)
+ def put_footer(color: DEFAULT_COLOR)
  put_edge(color, "┗", "")
  end

+ private
+
  def put_edge(color, prefix, text)
  ptext = "#{color}#{prefix}#{text}"
  textwidth = printing_width(ptext)
data/lib/news_scraper/configuration.rb ADDED
@@ -0,0 +1,33 @@
+ module NewsScraper
+ class Configuration
+ DEFAULT_SCRAPE_PATTERNS_FILEPATH = File.expand_path('../../../config/article_scrape_patterns.yml', __FILE__)
+ attr_accessor :fetch_method, :scrape_patterns_filepath
+
+ # <code>NewsScraper::Configuration.initialize</code> initializes the scrape_patterns_filepath
+ # and the fetch_method to the <code>DEFAULT_SCRAPE_PATTERNS_FILEPATH</code>
+ #
+ # Set the <code>scrape_patterns_filepath</code> to <code>nil</code> to disable saving during training
+ #
+ def initialize
+ self.scrape_patterns_filepath = DEFAULT_SCRAPE_PATTERNS_FILEPATH
+ self.fetch_method = proc { default_scrape_patterns }
+ end
+
+ # <code>NewsScraper::Configuration.scrape_patterns</code> proxies scrape_patterns
+ # requests to <code>fetch_method</code>:
+ #
+ # *Returns*
+ # - The result of calling the <code>fetch_method</code> proc, expected to be a hash
+ #
+ def scrape_patterns
+ fetch_method.call
+ end
+
+ private
+
+ def default_scrape_patterns
+ @default_scrape_patterns ||= {}
+ @default_scrape_patterns[scrape_patterns_filepath] ||= YAML.load_file(scrape_patterns_filepath)
+ end
+ end
+ end
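The new `Configuration` class is consumed through `NewsScraper.configure`, which is added to `lib/news_scraper.rb` later in this diff. A minimal usage sketch, assuming the gem is loaded with `require 'news_scraper'`; the patterns file path is hypothetical:

```
require 'news_scraper'

NewsScraper.configure do |config|
  # Persist newly trained presets to a project-local patterns file (hypothetical path)
  config.scrape_patterns_filepath = 'config/my_scrape_patterns.yml'
  # Or supply the patterns from any source by overriding fetch_method
  config.fetch_method = proc { YAML.load_file('config/my_scrape_patterns.yml') }
end

NewsScraper.configuration.scrape_patterns # => hash with 'presets', 'data_types' and 'domains'
```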
data/lib/news_scraper/errors.rb CHANGED
@@ -1,14 +1,23 @@

  module NewsScraper
- class ResponseError < StandardError; end
+ class ResponseError < StandardError
+ attr_reader :error_code, :message, :url
+
+ def initialize(opts = {})
+ @error_code = opts[:error_code]
+ @message = opts[:message]
+ @url = opts[:url]
+ super
+ end
+ end

  module Transformers
  class ScrapePatternNotDefined < StandardError
- attr_reader :root_domain, :uri
+ attr_reader :root_domain, :url

  def initialize(opts = {})
  @root_domain = opts[:root_domain]
- @uri = opts[:uri]
+ @url = opts[:url]
  super
  end
  end
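`ResponseError` now carries the failing request's details instead of a bare message string. A hedged sketch of handling it, with an illustrative URL:

```
require 'news_scraper'

begin
  payload = NewsScraper::Extractors::Article.new(url: 'http://example.com/article').extract
rescue NewsScraper::ResponseError => e
  warn "Request for #{e.url} failed with #{e.error_code}: #{e.message}"
end
```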
data/lib/news_scraper/extractors/article.rb CHANGED
@@ -10,9 +10,13 @@ module NewsScraper

  CLI.put_header(url)
  CLI.log "Beginning HTTP request for #{url}"
- response = HTTParty.get(url)
+ response = HTTParty.get(url, headers: { "User-Agent" => "news-scraper-#{NewsScraper::VERSION}" })

- raise ResponseError.new("#{response.code} - #{response.message}") unless response.code == 200
+ raise ResponseError.new(
+ error_code: response.code,
+ message: response.message,
+ url: url
+ ) unless response.code == 200

  CLI.log "#{response.code} - #{response.message}. Request successful for #{url}"
  CLI.put_footer
data/lib/news_scraper/scraper.rb CHANGED
@@ -16,6 +16,7 @@ module NewsScraper
  #
  # *Raises*
  # - Will raise a <code>Transformers::ScrapePatternNotDefined</code> if an article is not in the root domains
+ # - Will <code>yield</code> the error if a block is given
  # - Root domains are specified by the <code>article_scrape_patterns.yml</code> file
  # - This root domain will need to be trained, it would be helpful to have a PR created to train the domain
  # - You can train the domain by running <code>NewsScraper::Trainer::UrlTrainer.new(URL_TO_TRAIN).train</code>
@@ -27,13 +28,19 @@ module NewsScraper
  article_urls = Extractors::GoogleNewsRss.new(query: @query).extract

  transformed_articles = []
+
  article_urls.each do |article_url|
  payload = Extractors::Article.new(url: article_url).extract
+ article_transformer = Transformers::Article.new(url: article_url, payload: payload)

- transformed_article = Transformers::Article.new(url: article_url, payload: payload).transform
- transformed_articles << transformed_article
-
- yield transformed_article if block_given?
+ begin
+ transformed_article = article_transformer.transform
+ transformed_articles << transformed_article
+ yield transformed_article if block_given?
+ rescue Transformers::ScrapePatternNotDefined => e
+ raise e unless block_given?
+ yield e
+ end
  end

  transformed_articles
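With this change, an untrained domain no longer aborts a block-form scrape: the `ScrapePatternNotDefined` error is yielded to the block instead of raised. A rough sketch, assuming the constructor takes `query:` (suggested by `@query` above):

```
require 'news_scraper'

NewsScraper::Scraper.new(query: 'linux').scrape do |result|
  if result.is_a?(NewsScraper::Transformers::ScrapePatternNotDefined)
    warn "No scrape pattern for #{result.root_domain} (#{result.url}), consider training it"
  else
    puts "#{result[:title]} - #{result[:url]}"
  end
end
```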
data/lib/news_scraper/trainer/preset_selector.rb CHANGED
@@ -3,18 +3,16 @@ module NewsScraper
  class PresetSelector
  PROVIDER_PHRASE = 'I will provide a pattern using'.freeze

- def initialize(data_type:, data_type_presets:, url:, payload:)
+ def initialize(url:, payload:)
  @url = url
  @payload = payload
- @data_type_presets = data_type_presets
- @data_type = data_type
  end

- def select
- return unless @data_type_presets
+ def select(data_type)
+ pattern_options = pattern_options(data_type)

  selected_option = CLI.prompt_with_options(
- "Select which preset to use for #{@data_type}:",
+ "Select which preset to use for #{data_type}:",
  pattern_options.keys
  )

@@ -27,50 +25,42 @@ module NewsScraper
  end
  return if selected_option == 'skip'

- selected_index = pattern_options[selected_option]
- selected_preset_code = transform_results[selected_index].first
- @data_type_presets[selected_preset_code].merge('variable' => [selected_preset_code, @data_type].join('_'))
+ selected_preset_code = pattern_options[selected_option]
+ result = transform_results[data_type][selected_preset_code].merge(
+ 'variable' => [selected_preset_code, data_type].join('_')
+ )
+ result.delete('data')
+ result
  end

  private

- def pattern_options
- return {} unless @data_type_presets
-
- @pattern_options ||= begin
- temp_options = transform_results.each_with_object({}).with_index do |(results, options_hash), index|
- preset_name = "#{results[0]}_#{@data_type}"
- extracted_text = results[1]
- options_hash["#{preset_name}: #{extracted_text}"] = index
- end
- %w(xpath css).each do |pattern_provider|
- temp_options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
+ def pattern_options(data_type)
+ # Add valid options from the transformed results
+ options = transform_results[data_type].each_with_object({}) do |(option, details), valid_options|
+ next unless details['data'] && !details['data'].empty?
+ table_key = Terminal::Table.new do |t|
+ t << ['method', details['method']]
+ t << ['pattern', details['pattern']]
+ t << ['data', details['data']]
  end
- temp_options.merge('skip' => 'skip')
+ valid_options["\n#{table_key}"] = option
  end
- end

- def transform_results
- return {} unless @data_type_presets
-
- scrape_details = blank_scrape_details
- @results ||= @data_type_presets.each_with_object({}) do |(preset_name, preset_details), hash|
- scrape_details[@data_type] = preset_details
- train_transformer = Transformers::TrainerArticle.new(
- url: @url,
- payload: @payload,
- scrape_details: scrape_details,
- )
+ # Add in options to customize the pattern
+ %w(xpath css).each do |pattern_provider|
+ options["#{PROVIDER_PHRASE} #{pattern_provider}"] = pattern_provider
+ end

- transformed_result = train_transformer.transform[@data_type.to_sym]
- hash[preset_name] = transformed_result if transformed_result && !transformed_result.empty?
- end.to_a
+ # Add option in to skip
+ options.merge('skip' => 'skip')
  end

- def blank_scrape_details
- @blank_scrape_details ||= Constants::SCRAPE_PATTERNS.each_with_object({}) do |data_type, hash|
- hash[data_type] = nil
- end
+ def transform_results
+ @transform_results ||= Transformers::TrainerArticle.new(
+ url: @url,
+ payload: @payload
+ ).transform
  end
  end
  end
data/lib/news_scraper/trainer/url_trainer.rb CHANGED
@@ -8,18 +8,17 @@ module NewsScraper
  end

  def train
- return if article_scrape_patterns['domains'].key?(@root_domain)
+ return if NewsScraper.configuration.scrape_patterns['domains'].key?(@root_domain)

  CLI.put_header(@root_domain)
- CLI.log("There is no scrape pattern defined for #{@root_domain} in #{Constants::SCRAPE_PATTERN_FILEPATH}")
+ CLI.log("There is no scrape pattern defined for #{@root_domain}")
  CLI.log "Fetching information..."
  CLI.put_footer

  selected_presets = {}
- article_scrape_patterns['data_types'].each do |data_type|
+ NewsScraper.configuration.scrape_patterns['data_types'].each do |data_type|
  selected_presets[data_type] = selected_pattern(data_type)
  end
-
  save_selected_presets(selected_presets)
  end

@@ -27,29 +26,29 @@ module NewsScraper

  def selected_pattern(data_type)
  CLI.put_header("Determining information for #{data_type}")
- data_type_presets = article_scrape_patterns['presets'][data_type]
- pattern = if data_type_presets.nil?
+ pattern = if NewsScraper.configuration.scrape_patterns['presets'][data_type].nil?
  CLI.log("No presets were found for #{data_type}. Skipping to next.")
  nil
  else
- PresetSelector.new(
- url: @url,
- payload: @payload,
- data_type_presets: data_type_presets,
- data_type: data_type
- ).select
+ preset_selector.select(data_type)
  end
  CLI.put_footer
-
  pattern || { 'method' => "<<<<< TODO >>>>>", 'pattern' => "<<<<< TODO >>>>>" }
  end

+ def preset_selector
+ @preset_selector ||= PresetSelector.new(url: @url, payload: @payload)
+ end
+
  def save_selected_presets(selected_presets)
- current_content = File.read(Constants::SCRAPE_PATTERN_FILEPATH).chomp
+ return unless NewsScraper.configuration.scrape_patterns_filepath
+
+ current_content = File.read(NewsScraper.configuration.scrape_patterns_filepath).chomp
  new_content = "#{current_content}\n#{build_domain_yaml(selected_presets)}\n"

- File.write(Constants::SCRAPE_PATTERN_FILEPATH, new_content)
- CLI.log("Successfully wrote presets for #{@root_domain} to #{Constants::SCRAPE_PATTERN_FILEPATH}.")
+ File.write(NewsScraper.configuration.scrape_patterns_filepath, new_content)
+ CLI.log("Successfully wrote presets for #{@root_domain} to"\
+ " #{NewsScraper.configuration.scrape_patterns_filepath}.")
  end

  def build_domain_yaml(selected_presets)
@@ -65,10 +64,6 @@ module NewsScraper
  end
  output_string.join("\n")
  end
-
- def article_scrape_patterns
- @article_scrape_patterns ||= YAML.load_file(Constants::SCRAPE_PATTERN_FILEPATH)
- end
  end
  end
  end
data/lib/news_scraper/transformers/article.rb CHANGED
@@ -2,6 +2,7 @@ require 'nokogiri'
  require 'sanitize'
  require 'readability'
  require 'htmlbeautifier'
+ require 'news_scraper/transformers/nokogiri/functions'

  module NewsScraper
  module Transformers
@@ -13,9 +14,8 @@ module NewsScraper
  # - <code>payload</code>: keyword arg - the result of the scrape
  #
  def initialize(url:, payload:)
- uri_parser = URIParser.new(url)
- @uri = uri_parser.without_scheme
- @root_domain = uri_parser.host
+ @url = url
+ @root_domain = URIParser.new(url).host
  @payload = payload
  end

@@ -28,37 +28,36 @@ module NewsScraper
  # - <code>transformed_response</code>: the response that has been parsed and transformed to a hash
  #
  def transform
- raise ScrapePatternNotDefined.new(uri: @uri, root_domain: @root_domain) unless scrape_details
-
- transformed_response.merge(uri: @uri, root_domain: @root_domain)
+ scrape_details = NewsScraper.configuration.scrape_patterns['domains'][@root_domain]
+ raise ScrapePatternNotDefined.new(url: @url, root_domain: @root_domain) unless scrape_details
+ transformed_response(scrape_details).merge(url: @url, root_domain: @root_domain)
  end

  private

- def scrape_details
- @scrape_details ||= Constants::SCRAPE_PATTERNS['domains'][@root_domain]
- end
+ def transformed_response(scrape_details)
+ NewsScraper.configuration.scrape_patterns['data_types'].each_with_object({}) do |data_type, response|
+ response[data_type.to_sym] = nil
+ next unless scrape_details[data_type]

- def transformed_response
- Constants::SCRAPE_PATTERNS['data_types'].each_with_object({}) do |data_type, response|
- response[data_type.to_sym] = parsed_data(data_type)
+ response[data_type.to_sym] = parsed_data(
+ scrape_details[data_type]['method'].to_sym,
+ scrape_details[data_type]['pattern']
+ )
  end
  end

- def parsed_data(data_type)
- return nil unless scrape_details[data_type]
-
- scrape_method = scrape_details[data_type]['method'].to_sym
+ def parsed_data(scrape_method, scrape_pattern)
  case scrape_method
  when :xpath
- noko_html = Nokogiri::HTML(@payload)
+ noko_html = ::Nokogiri::HTML(@payload)
  Sanitize.fragment(
- noko_html.send(scrape_method, "(#{scrape_details[data_type]['pattern']})[1]")
+ noko_html.xpath("(#{scrape_pattern})[1]", Nokogiri::Functions.new)
  ).squish
  when :css
- noko_html = Nokogiri::HTML(@payload)
+ noko_html = ::Nokogiri::HTML(@payload)
  Sanitize.fragment(
- noko_html.send(scrape_method, scrape_details[data_type]['pattern'])
+ noko_html.css(scrape_pattern)
  ).squish
  when :readability
  content = Readability::Document.new(
data/lib/news_scraper/transformers/nokogiri/functions.rb ADDED
@@ -0,0 +1,15 @@
+ require 'nokogiri'
+
+ module NewsScraper
+ module Transformers
+ module Nokogiri
+ class Functions
+ # Implements fn:string-join of XPath 2.0
+ def string_join(nodeset, separator)
+ nodeset.map(&:text).join(separator)
+ end
+ alias_method :'string-join', :string_join
+ end
+ end
+ end
+ end
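This handler is passed to Nokogiri's `xpath` call in `Transformers::Article#parsed_data` above, so scrape patterns can use `string-join()` even though libxml2 only implements XPath 1.0. A standalone sketch, assuming Nokogiri returns the string result of the top-level expression; the HTML snippet is made up:

```
require 'nokogiri'
require 'news_scraper/transformers/nokogiri/functions'

html = ::Nokogiri::HTML('<p class="tag">linux</p><p class="tag">kernel</p>')
handler = NewsScraper::Transformers::Nokogiri::Functions.new

# string-join is dispatched to Functions#string_join via the alias
html.xpath('string-join(//p[@class="tag"], ", ")', handler) # => "linux, kernel"
```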
data/lib/news_scraper/transformers/trainer_article.rb CHANGED
@@ -6,12 +6,26 @@ module NewsScraper
  # *Params*
  # - <code>url</code>: keyword arg - the url on which scraping was done
  # - <code>payload</code>: keyword arg - the result of the scrape
- # - <code>scrape_details</code>: keyword arg - The pattern/methods for the domain to use in the transformation
  #
- def initialize(url:, payload:, scrape_details:)
- @scrape_details = scrape_details
+ def initialize(url:, payload:)
  super(url: url, payload: payload)
  end
+
+ # Transform the article
+ #
+ # *Returns*
+ # - <code>transformed_response</code>: tries all possible presets and returns a hash representing the results
+ #
+ def transform
+ presets = NewsScraper.configuration.scrape_patterns['presets']
+ transformed_response = presets.each_with_object({}) do |(data_type, preset_options), response|
+ response[data_type] = preset_options.each_with_object({}) do |(option, scrape_details), data_type_options|
+ data = parsed_data(scrape_details['method'].to_sym, scrape_details['pattern'])
+ data_type_options[option] = scrape_details.merge('data' => data)
+ end
+ end
+ transformed_response.merge('url' => @url, 'root_domain' => @root_domain)
+ end
  end
  end
  end
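`TrainerArticle#transform` now runs every preset for every data type itself, so callers only supply the URL and raw payload. A hedged sketch of inspecting the results; the URL reuses the README's illustrative example:

```
require 'news_scraper'

url = 'https://linusworld.com/the-linux-kernel'
payload = NewsScraper::Extractors::Article.new(url: url).extract
results = NewsScraper::Transformers::TrainerArticle.new(url: url, payload: payload).transform

# Results are keyed by data type, then preset name, each carrying its extracted 'data'
results['title'].each do |preset_name, details|
  puts "#{preset_name}: #{details['data'].inspect}"
end
```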
data/lib/news_scraper/version.rb CHANGED
@@ -1,3 +1,3 @@
  module NewsScraper
- VERSION = "0.1.2".freeze
+ VERSION = "1.0.0".freeze
  end
data/lib/news_scraper.rb CHANGED
@@ -1,7 +1,8 @@
  require 'httparty'
  require 'yaml'
+ require 'terminal-table'

- require 'news_scraper/constants'
+ require 'news_scraper/configuration'
  require 'news_scraper/uri_parser'
  require 'news_scraper/active_support_lite/string'

@@ -23,11 +24,12 @@ require 'news_scraper/trainer'

  module NewsScraper
  extend self
+ attr_writer :configuration

  # <code>NewsScraper::train</code> is an interactive command-line prompt that:
  #
  # 1. Collates all articles for the given :query
- # 2. Grep for <code>:data_types</code> using <code>:presets</code> in <code>config/article_scrape_patterns.yml</code>
+ # 2. Grep for <code>:data_types</code> using <code>:presets</code> in the config set in the <code>configuration</code>
  # 3. Displays the results of each <code>:preset</code> grep for a given <code>:data_type</code>
  # 4. Prompts to select one of the <code>:presets</code> or define a pattern for that domain's <code>:data_type</code>
  # N.B: User may ignore all presets and manually configure it in the YAML file
@@ -36,7 +38,21 @@ module NewsScraper
  # *Params*
  # - <code>query</code>: a keyword arugment specifying the query to train on
  #
+ # :nocov:
  def train(query:)
  Trainer.train(query: query)
  end
+ # :nocov:
+
+ def configuration
+ @configuration ||= Configuration.new
+ end
+
+ def reset_configuration
+ @configuration = Configuration.new
+ end
+
+ def configure
+ yield(configuration)
+ end
  end
data/news_scraper.gemspec CHANGED
@@ -29,6 +29,7 @@ Gem::Specification.new do |spec|
  spec.add_dependency 'sanitize', '~> 4.2', '>= 4.2.0'
  spec.add_dependency 'ruby-readability', '~> 0.7', '>= 0.7.0'
  spec.add_dependency 'htmlbeautifier', '~> 1.1', '>= 1.1.1'
+ spec.add_dependency 'terminal-table', '~> 1.5', '>= 1.5.2'

  spec.add_development_dependency 'bundler', '~> 1.12', '>= 1.12.0'
  spec.add_development_dependency 'rake', '~> 10.0', '>= 10.0.0'
@@ -38,4 +39,5 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency 'timecop', '~> 0.8', '>= 0.8.0'
  spec.add_development_dependency 'rubocop', '~> 0.42', '>= 0.42.0'
  spec.add_development_dependency 'rdoc', '~> 4.2', '>= 4.2.2'
+ spec.add_development_dependency 'simplecov', '~> 0.12.0'
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: news_scraper
  version: !ruby/object:Gem::Version
- version: 0.1.2
+ version: 1.0.0
  platform: ruby
  authors:
  - Richard Wu
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2016-09-19 00:00:00.000000000 Z
+ date: 2016-09-25 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
@@ -111,6 +111,26 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: 1.1.1
+ - !ruby/object:Gem::Dependency
+ name: terminal-table
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '1.5'
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 1.5.2
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '1.5'
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 1.5.2
  - !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
@@ -271,6 +291,20 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: 4.2.2
+ - !ruby/object:Gem::Dependency
+ name: simplecov
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.12.0
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 0.12.0
  description: A collection of extractors, transformers and loaders for scraping news
  websites and syndicates.
  email:
@@ -296,7 +330,7 @@ files:
  - lib/news_scraper.rb
  - lib/news_scraper/active_support_lite/string.rb
  - lib/news_scraper/cli.rb
- - lib/news_scraper/constants.rb
+ - lib/news_scraper/configuration.rb
  - lib/news_scraper/errors.rb
  - lib/news_scraper/extractors/article.rb
  - lib/news_scraper/extractors/google_news_rss.rb
@@ -306,6 +340,7 @@ files:
  - lib/news_scraper/trainer/preset_selector.rb
  - lib/news_scraper/trainer/url_trainer.rb
  - lib/news_scraper/transformers/article.rb
+ - lib/news_scraper/transformers/nokogiri/functions.rb
  - lib/news_scraper/transformers/trainer_article.rb
  - lib/news_scraper/uri_parser.rb
  - lib/news_scraper/version.rb
data/lib/news_scraper/constants.rb DELETED
@@ -1,6 +0,0 @@
- module NewsScraper
- module Constants
- SCRAPE_PATTERN_FILEPATH = File.expand_path('../../../config/article_scrape_patterns.yml', __FILE__)
- SCRAPE_PATTERNS = YAML.load_file(SCRAPE_PATTERN_FILEPATH)
- end
- end