news_scraper 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cadeecdf95b6d5fd907671773fc3cc521fe756a2
4
+ data.tar.gz: 150d667251c00f96b01b195c99fe2a44f3da8da9
5
+ SHA512:
6
+ metadata.gz: 8af8f251dce23589d5e08af5f9c0510d031b91ed56188a0e6b3418b26397ad9ba98835a516c8e174d8aed3ea791793e5ceec8c3422a6e7731ac84196b49f287b
7
+ data.tar.gz: d55a513397be97f08b0adcbc975db4ad1c39d0887576979b138a50b5cb3601eb03036a9cfe75d1b0d4dcd2ab0303256c9112c865749905e5540f35bdefef18eb
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.gem
data/.rubocop.yml ADDED
@@ -0,0 +1,96 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.3
3
+
4
+ ClassLength:
5
+ Max: 500
6
+
7
+ ModuleLength:
8
+ Max: 500
9
+
10
+ Rails:
11
+ Enabled: false
12
+
13
+ Lint/AssignmentInCondition:
14
+ Enabled: false
15
+
16
+ Style/Documentation:
17
+ Enabled: false
18
+
19
+ Style/MultilineOperationIndentation:
20
+ Enabled: true
21
+
22
+ Style/AlignParameters:
23
+ EnforcedStyle: with_fixed_indentation
24
+
25
+ Style/FirstParameterIndentation:
26
+ EnforcedStyle: consistent
27
+
28
+ Style/TrailingCommaInLiteral:
29
+ Enabled: false
30
+
31
+ Style/TrailingCommaInArguments:
32
+ Enabled: false
33
+
34
+ Style/SignalException:
35
+ EnforcedStyle: only_raise
36
+
37
+ Style/NumericLiterals:
38
+ Enabled: true
39
+
40
+ Style/CaseIndentation:
41
+ IndentWhenRelativeTo: end
42
+
43
+ Style/IndentHash:
44
+ EnforcedStyle: consistent
45
+
46
+ Style/WordArray:
47
+ Enabled: true
48
+
49
+ Style/ModuleFunction:
50
+ Enabled: false
51
+
52
+ Style/RaiseArgs:
53
+ EnforcedStyle: compact
54
+
55
+ Metrics/AbcSize:
56
+ Enabled: false
57
+
58
+ Metrics/CyclomaticComplexity:
59
+ Enabled: false
60
+
61
+ Style/StringLiterals:
62
+ Enabled: false
63
+
64
+ Metrics/LineLength:
65
+ Max: 120
66
+
67
+ Metrics/ClassLength:
68
+ Enabled: false
69
+
70
+ Metrics/MethodLength:
71
+ Enabled: false
72
+
73
+ Metrics/ParameterLists:
74
+ Max: 5
75
+ CountKeywordArgs: false
76
+
77
+ Metrics/PerceivedComplexity:
78
+ Enabled: false
79
+
80
+ Lint/EndAlignment:
81
+ AlignWith: variable
82
+
83
+ Style/FrozenStringLiteralComment:
84
+ Enabled: false
85
+
86
+ Style/Alias:
87
+ EnforcedStyle: prefer_alias_method
88
+
89
+ Style/MutableConstant:
90
+ Enabled: true
91
+
92
+ Performance/Casecmp:
93
+ Enabled: true
94
+
95
+ Style/GuardClause:
96
+ Enabled: true
@@ -0,0 +1,49 @@
1
+ # Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, and in the interest of
4
+ fostering an open and welcoming community, we pledge to respect all people who
5
+ contribute through reporting issues, posting feature requests, updating
6
+ documentation, submitting pull requests or patches, and other activities.
7
+
8
+ We are committed to making participation in this project a harassment-free
9
+ experience for everyone, regardless of level of experience, gender, gender
10
+ identity and expression, sexual orientation, disability, personal appearance,
11
+ body size, race, ethnicity, age, religion, or nationality.
12
+
13
+ Examples of unacceptable behavior by participants include:
14
+
15
+ * The use of sexualized language or imagery
16
+ * Personal attacks
17
+ * Trolling or insulting/derogatory comments
18
+ * Public or private harassment
19
+ * Publishing other's private information, such as physical or electronic
20
+ addresses, without explicit permission
21
+ * Other unethical or unprofessional conduct
22
+
23
+ Project maintainers have the right and responsibility to remove, edit, or
24
+ reject comments, commits, code, wiki edits, issues, and other contributions
25
+ that are not aligned to this Code of Conduct, or to ban temporarily or
26
+ permanently any contributor for other behaviors that they deem inappropriate,
27
+ threatening, offensive, or harmful.
28
+
29
+ By adopting this Code of Conduct, project maintainers commit themselves to
30
+ fairly and consistently applying these principles to every aspect of managing
31
+ this project. Project maintainers who do not follow or enforce the Code of
32
+ Conduct may be permanently removed from the project team.
33
+
34
+ This code of conduct applies both within project spaces and in public spaces
35
+ when an individual is representing the project or its community.
36
+
37
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
38
+ reported by contacting a project maintainer at richardwu1997@gmail.com. All
39
+ complaints will be reviewed and investigated and will result in a response that
40
+ is deemed necessary and appropriate to the circumstances. Maintainers are
41
+ obligated to maintain confidentiality with regard to the reporter of an
42
+ incident.
43
+
44
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
45
+ version 1.3.0, available at
46
+ [http://contributor-covenant.org/version/1/3/0/][version]
47
+
48
+ [homepage]: http://contributor-covenant.org
49
+ [version]: http://contributor-covenant.org/version/1/3/0/
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Richard Wu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,105 @@
1
+ # NewsScraper
2
+
3
+ ### Simple ETL news scraper in Ruby
4
+
5
+ [RubyGems](https://rubygems.org/gems/news_scraper)
6
+
7
+ A collection of extractors, transformers and loaders for a variety of news feeds and outlets.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'news_scraper'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install news_scraper
24
+
25
+ ## Usage
26
+
27
+ ### Scraping
28
+
29
+ `NewsScraper::Scraper#scrape` will return an array of the transformed data for all Google News RSS articles for the given query.
30
+
31
+ Optionally, you can pass in a block and it will yield the transformed data on a per-article basis.
32
+
33
+ It takes in 1 parameter `query:`.
34
+
35
+ Array notation
36
+ ```
37
+ article_hashes = NewsScraper::Scraper.new(query: 'Shopify').scrape # [ { author: ... }, { author: ... } ... ]
38
+ ```
39
+
40
+ Block notation
41
+ ```
42
+ NewsScraper::Scraper.new(query: 'Shopify').scrape do |article_hash|
43
+ # { author: ... }
44
+ end
45
+ ```
46
+
47
+ How the `Scraper` extracts and parses this information is determined by scrape patterns (see **Scrape Patterns**).
48
+
49
+ ### Transformed Data
50
+
51
+ Calling `NewsScraper::Scraper#scrape` with either the array or block notation will yield `transformed_data` hashes. [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml) defines the data types that will be scraped for.
52
+
53
+ In addition, the `uri` and `root_domain` (hostname) of the article are returned in the hash.
54
+
55
+ Example
56
+ ```
57
+ {
58
+ author: 'Linus Torvalds',
59
+ body: 'The Linux kernel developed by Linus Torvalds has become the backbone of most electronic devices we use to date. It powers mobile phones, laptops, embedded devices, and even rockets...',
60
+ description: 'The Linux kernel is one of the most important contributions to the world of technology.',
61
+ keywords: 'linux,kernel,linus,torvalds',
62
+ section: 'technology',
63
+ datetime: '1991-10-05T12:00:00+00:00',
64
+ title: 'Linus Linux',
65
+ uri: 'linusworld.com/the-linux-kernel',
66
+ root_domain: 'linusworld.com'
67
+ }
68
+ ```
69
+
70
+ ### Scrape Patterns
71
+
72
+ Scrape patterns are xpath or CSS patterns used by Nokogiri to extract relevant HTML elements.
73
+
74
+ Extracting each `:data_type` (see Example under **Transformed Data**) requires a scrape pattern. A few `:presets` are specified in [`article_scrape_patterns.yml`](https://github.com/richardwu/news_scraper/blob/master/config/article_scrape_patterns.yml).
75
+
76
+ Since each news site (identified with `:root_domain`) uses a different markup, scrape patterns are defined on a per-`:root_domain` basis.
77
+
78
+ Specifying scrape patterns for new, undefined `:root_domains` is called training (see **Training**).
79
+
80
+ ### Training
81
+
82
+ For each `:root_domain`, it is necessary to specify a scrape pattern for each of the `:data_type`s. A rake task was written to provide a CLI for appending new `:root_domain`s using `:preset` scrape patterns.
83
+
84
+ Simply run
85
+ ```
86
+ bundle exec rake scraper:train QUERY=<query>
87
+ ```
88
+
89
+ where the CLI will step through the articles and `:root_domain`s of the articles relevant to `<query>`.
90
+
91
+ ## Development
92
+
93
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
94
+
95
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
96
+
97
+ ## Contributing
98
+
99
+ Bug reports and pull requests are welcome on GitHub at https://github.com/richardwu/news_scraper. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
100
+
101
+
102
+ ## License
103
+
104
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
105
+
data/Rakefile ADDED
@@ -0,0 +1,24 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+ require 'rdoc/task'
4
+
5
+ require 'news_scraper'
6
+
7
# Test task: runs every *_test.rb file under test/, with test/ on the load path.
Rake::TestTask.new do |test_task|
  test_task.pattern = 'test/**/*_test.rb'
  test_task.libs.push('test')
end

namespace :scraper do
  desc 'CLI that steps through articles for a given query and displays preset scrape pattern results; parameters: QUERY'
  task :train do
    # The query is supplied through the environment: rake scraper:train QUERY=<query>
    query = ENV['QUERY']
    unless query
      raise "QUERY param not given.\n\tUsage: bundle exec rake scraper:train QUERY=<query>"
    end

    NewsScraper::Trainer.train(query: query)
  end
end

# Documentation task: renders the README and everything under lib/ into doc/.
RDoc::Task.new do |rdoc_task|
  rdoc_task.rdoc_dir = "doc"
  rdoc_task.main = "README.md"
  rdoc_task.rdoc_files.include("README.md", "lib/*.rb", "lib/**/*.rb")
end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "news_scraper"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
# Start an interactive IRB session. news_scraper is required earlier in this
# script, so the gem's constants are available at the prompt.
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
# Strict mode: exit on the first failing command (-e), treat unset variables
# as errors (-u), and fail a pipeline if any command in it fails (pipefail).
+ set -euo pipefail
3
# Split words on newlines and tabs only, never on spaces.
+ IFS=$'\n\t'
4
# Echo each script line as it is read (-v) and each command as it runs (-x).
+ set -vx
5
+
6
# Install the gems declared by the project's Gemfile.
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/circle.yml ADDED
@@ -0,0 +1,3 @@
1
+ test:
2
+ pre:
3
+ - bundle exec rubocop
@@ -0,0 +1,116 @@
1
+ # All domains should include the scrape method/pattern for data_types
2
+
3
+ data_types:
4
+ - "author"
5
+ - "body"
6
+ - "description"
7
+ - "keywords"
8
+ - "section"
9
+ - "datetime"
10
+ - "title"
11
+
12
+ # All data types must include :method and :pattern
13
+ #
14
+ presets:
15
+ author:
16
+ class: &class_author
17
+ method: css
18
+ pattern: ".author"
19
+ id: &id_author
20
+ method: css
21
+ pattern: "#author"
22
+ name: &name_author
23
+ method: css
24
+ pattern: ".author-name"
25
+ link: &link_author
26
+ method: xpath
27
+ pattern: "//a[contains(@href, 'author')]"
28
+ meta: &meta_author
29
+ method: xpath
30
+ pattern: "//meta[@name='author']/@content"
31
+ rel_link: &rel_link_author
32
+ method: xpath
33
+ pattern: "//a[@rel='author']"
34
+ vcard: &vcard_author
35
+ method: css
36
+ pattern: ".vcard .fn"
37
+ body:
38
+ readability: &readability_body
39
+ method: "readability"
40
+ pattern: ""
41
+ description:
42
+ meta: &meta_description
43
+ method: "xpath"
44
+ pattern: "//meta[@name='description']/@content"
45
+ og: &og_description
46
+ method: "xpath"
47
+ pattern: "//meta[@property='og:description']/@content"
48
+ keywords:
49
+ meta: &meta_keywords
50
+ method: "xpath"
51
+ pattern: "//meta[@name='keywords']/@content"
52
+ article_tag: &article_tag_keywords
53
+ method: "xpath"
54
+ pattern: "//meta[@property='article:tag']/@content"
55
+ section:
56
+ meta: &meta_section
57
+ method: "xpath"
58
+ pattern: "//meta[@property='article:section']/@content"
59
+ datetime:
60
+ article_date_original: &article_date_original_datetime
61
+ method: xpath
62
+ pattern: //meta[@name='article_date_original']/@content
63
+ article_published_time: &article_published_time_datetime
64
+ method: "xpath"
65
+ pattern: "//meta[@property='article:published_time']/@content"
66
+ date: &date_datetime
67
+ method: xpath
68
+ pattern: //meta[@name='date']/@content
69
+ date_published: &date_published_datetime
70
+ method: xpath
71
+ pattern: //*[@itemprop='datePublished']/@datetime
72
+ og_published_time: &og_published_time_datetime
73
+ method: xpath
74
+ pattern: //meta[@property='og:published_time']/@content
75
+ original_publication_date: &original_publication_date_datetime
76
+ method: xpath
77
+ pattern: //meta[@name='OriginalPublicationDate']/@content
78
+ publication_date: &publication_date_datetime
79
+ method: xpath
80
+ pattern: //meta[@name='publication_date']/@content
81
+ publish_date: &publish_date_datetime
82
+ method: xpath
83
+ pattern: //meta[@name='PublishDate']/@content
84
+ rnews_date_published: &rnews_date_published_datetime
85
+ method: xpath
86
+ pattern: //meta[@property='rnews:datePublished']/@content
87
+ sailthru_date: &sailthru_date_datetime
88
+ method: xpath
89
+ pattern: //meta[@name='sailthru.date']/@content
90
+ title:
91
+ html: &html_title
92
+ method: "xpath"
93
+ pattern: "//title"
94
+ og: &og_title
95
+ method: "xpath"
96
+ pattern: "//meta[@property='og:title']/@content"
97
+
98
+ domains:
99
+ investors.com:
100
+ author: *rel_link_author
101
+ body:
102
+ method: "css"
103
+ pattern: ".single-post-content"
104
+ description: *og_description
105
+ keywords: *meta_keywords
106
+ section: *meta_section
107
+ datetime: *article_published_time_datetime
108
+ title: *og_title
109
+ fool.com:
110
+ author: *meta_author
111
+ body: *readability_body
112
+ description: *meta_description
113
+ keywords: *article_tag_keywords
114
+ section: *meta_section
115
+ datetime: *date_datetime
116
+ title: *og_title
@@ -0,0 +1,4 @@
1
+ extractors:
2
+ google_news_rss:
3
+ article_urls: "tmp/google_news_rss/article_urls"
4
+
data/dev.yml ADDED
@@ -0,0 +1,13 @@
1
+ name: news-scraper
2
+
3
+ up:
4
+ - ruby: 2.3.1
5
+ - bundler
6
+
7
+ commands:
8
+ rubocop:
9
+ desc: 'Lint the Ruby code with Rubocop'
10
+ run: bundle exec rubocop
11
+ aliases: [rubo, lint, l]
12
+ test: bundle exec rake test
13
+ docs: bundle exec rake rdoc
@@ -0,0 +1,11 @@
1
+ class String
2
+ def squish
3
+ dup.squish!
4
+ end
5
+
6
+ def squish!
7
+ gsub!(/[[:space:]]+/, ' ')
8
+ strip!
9
+ self
10
+ end
11
+ end
@@ -0,0 +1,106 @@
1
+ require 'readline'
2
+
3
module NewsScraper
  # Terminal helpers: coloured log lines, yes/no confirmation, Readline-based
  # prompts and full-width header/footer rules. All methods are callable on the
  # module itself (NewsScraper::CLI.log ...) thanks to `extend self`.
  module CLI
    extend self

    # Default ANSI colour (cyan) used for the left-hand gutter and rules.
    DEFAULT_COLOR = "\x1b[36m".freeze

    # Coloured prompt shown by Readline when asking for input.
    PROMPT = "\x1b[34m┃ > \x1b[33m".freeze

    # ANSI CSI escape sequence: ESC [ <optional digits/semicolons> <letter>.
    # The parameter list is optional so the bare reset "\e[m" is covered, and
    # the final byte is restricted to real letters (the range A-z would also
    # match the punctuation that sits between 'Z' and 'a').
    ANSI_ESCAPE = /\x1b\[[\d;]*[A-Za-z]/

    # Prints +message+ behind a coloured gutter.
    #
    # color    - ANSI escape used for the gutter.
    # new_line - when true, a blank line is printed after the message.
    def log(message, color: DEFAULT_COLOR, new_line: false)
      $stdout.puts "#{color}┃\x1b[0m #{message}"
      # puts never doubles an existing trailing newline, so appending "\n" to
      # the message has no visible effect; the blank line is written explicitly.
      $stdout.puts if new_line
    end

    # Logs every line of a multi-line +message+ separately, forwarding the
    # colour and new_line options to #log.
    def log_lines(message, color: DEFAULT_COLOR, new_line: false)
      message.split("\n").each do |line|
        log(line, color: color, new_line: new_line)
      end
    end

    # Asks a yes/no question on stdout and reads the answer from stdin.
    #
    # Returns a truthy value when the answer starts with "y"/"Y" (leading
    # whitespace ignored) and nil otherwise, including on end-of-input.
    def confirm(msg, color: DEFAULT_COLOR)
      print "#{color}┃\x1b[0m #{msg} (y/n) "
      answer = $stdin.gets
      return nil if answer.nil? # EOF: there is no answer, treat it as "no"

      # Anchored so that answers merely containing a "y" ("no way") are not
      # mistaken for a confirmation.
      answer =~ /\A\s*y/i
    end

    # Optionally logs +msg+, then reads one line through Readline.
    #
    # Returns the entered String, or nil when the user hits Ctrl-C or
    # Readline itself returns nil.
    def get_input(msg = nil)
      log(msg) if msg
      result = read_line
      print "\e[0m" # reset colour
      result
    end

    # Presents +options+ as a numbered list and keeps prompting until the user
    # enters a number between 1 and options.length.
    #
    # Returns the chosen element of +options+.
    # Raises ArgumentError when +options+ is empty, since no input could ever
    # satisfy the prompt.
    def prompt_with_options(question, options)
      raise ArgumentError, "options must not be empty" if options.empty?

      log(question)
      log("Your options are:")
      options.each.with_index(1) do |v, idx|
        log("#{idx}) #{v}")
      end
      log("Choose a number between 1 and #{options.length}")

      choice = nil
      until choice
        line = read_line

        # nil covers both Ctrl-C and a nil result from Readline: emit a line
        # break and ask again.
        if line.nil?
          STDERR.puts
          next
        end

        line = line.strip
        number = line.to_i
        # Round-tripping through to_s rejects inputs such as "2abc" or "02".
        choice = number if number.to_s == line && number.between?(1, options.length)
      end

      print "\e[0m" # reset colour
      options[choice - 1]
    end

    ## Fancy Headers and Footers

    # Prints a full-width header rule that starts with +text+.
    def put_header(text = "", color = DEFAULT_COLOR)
      put_edge(color, "┏━━ ", text)
    end

    # Prints a full-width footer rule.
    def put_footer(color = DEFAULT_COLOR)
      put_edge(color, "┗", "")
    end

    # Prints +prefix+ and +text+ in +color+, padded with "━" up to the
    # terminal width. Content wider than the terminal is truncated to fit.
    def put_edge(color, prefix, text)
      visible = "#{prefix}#{text}"
      termwidth = terminal_width

      if printing_width(visible) > termwidth
        # Truncate by printable characters only. Escape sequences are removed
        # first so the slice can neither count zero-width bytes nor cut a
        # sequence in half.
        visible = visible.gsub(ANSI_ESCAPE, '')[0...termwidth]
      end

      pad = "━" * (termwidth - printing_width(visible))
      $stdout.puts "#{color}#{visible}#{color}#{pad}\x1b[0m"
    end

    # ANSI escape sequences (like \x1b[31m) have zero width.
    # when calculating the padding width, we must exclude them.
    def printing_width(str)
      str.gsub(ANSI_ESCAPE, '').size
    end

    private

    # Reads one line with Readline using the coloured prompt, with completion
    # disabled. Returns nil instead of raising when the user presses Ctrl-C.
    def read_line
      Readline.completion_append_character = " "
      Readline.completion_proc = nil
      Readline.readline(PROMPT, true)
    rescue Interrupt
      nil
    end

    # Width of the attached console in columns; 80 when no console is
    # available, and never less than 30.
    def terminal_width
      width = IO.respond_to?(:console) && IO.console ? IO.console.winsize[1] : 80
      width < 30 ? 30 : width
    end
  end
end
@@ -0,0 +1,6 @@
1
require 'yaml'

module NewsScraper
  # File locations and data loaded once when this file is required.
  module Constants
    # Absolute path of the scrape pattern definitions, resolved relative to
    # this file (three directories up, then config/).
    SCRAPE_PATTERN_FILEPATH = File.expand_path('../../../config/article_scrape_patterns.yml', __FILE__)

    # Parsed contents of the scrape pattern file.
    #
    # The file relies on YAML anchors and aliases (&name / *name). Psych 4
    # rejects aliases in load_file unless `aliases: true` is given, while
    # older Psych versions do not know that keyword and raise ArgumentError,
    # in which case the plain call (which always allowed aliases) is used.
    SCRAPE_PATTERNS = begin
      YAML.load_file(SCRAPE_PATTERN_FILEPATH, aliases: true)
    rescue ArgumentError
      YAML.load_file(SCRAPE_PATTERN_FILEPATH)
    end
  end
end
@@ -0,0 +1,16 @@
1
+
2
module NewsScraper
  # Error class for response-related failures; adds nothing to StandardError.
  class ResponseError < StandardError; end

  module Transformers
    # Error carrying the root domain and URI for which no scrape pattern is
    # defined.
    class ScrapePatternNotDefined < StandardError
      attr_reader :root_domain, :uri

      # opts - either a Hash with optional :root_domain and :uri entries, or a
      #        plain message (as passed by `raise ScrapePatternNotDefined, "msg"`).
      #
      # With a Hash, the message is built from the given details; when neither
      # detail is present the default StandardError message (the class name)
      # is used. With a non-Hash argument, it is used as the message and both
      # readers return nil.
      def initialize(opts = {})
        details = opts.is_a?(Hash) ? opts : {}
        @root_domain = details[:root_domain]
        @uri = details[:uri]
        # Pass an explicit message: a bare `super` would forward the opts Hash
        # itself and turn it into the exception message.
        super(opts.is_a?(Hash) ? build_message : opts)
      end

      private

      # Human-readable message built from the stored details, or nil when
      # there are none (nil makes StandardError fall back to the class name).
      def build_message
        return nil if @root_domain.nil? && @uri.nil?

        "No scrape pattern defined for root domain #{@root_domain.inspect} (uri: #{@uri.inspect})"
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require 'nokogiri'
2
+
3
module NewsScraper
  module Extractors
    # Extractor for a single article, identified by its URL.
    class Article
      # NOTE(review): http_request is presumably supplied by this mixin, which
      # is defined outside this file; confirm.
      include ExtractorsHelpers

      # url - address of the article to fetch (keyword argument).
      def initialize(url:)
        @url = url
      end

      # Requests the article URL and returns the body of the response.
      def extract
        response = http_request(@url)
        response.body
      end
    end
  end
end