telegraph-parser 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/lib/telegraph/parser/article.rb +9 -9
- data/lib/telegraph/parser/errors.rb +5 -0
- data/lib/telegraph/parser/fetcher.rb +23 -0
- data/lib/telegraph/parser/parser.rb +32 -24
- data/lib/telegraph/parser/version.rb +1 -1
- data/lib/telegraph/parser.rb +2 -1
- data/telegraph-parser.gemspec +1 -0
- metadata +20 -3
- data/lib/telegraph/parser/binding.rb +0 -31
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26cc01f88e7399f22fa4289fbb7d48dd9abb1e42
|
4
|
+
data.tar.gz: 3396ed38a2844d8daf0512107246e2f9cb07fa37
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9275b8c5a759ab28c47dc4d0da3968ba965c9a5bcb8684dbbb7bac626013b348b6649597b57373958d5d9d3a52a59c4f62e1af5b6f2b19455cd16e6b18e0ebe0
|
7
|
+
data.tar.gz: 51ba6f30a59614dcd26670c101bd059b4a6f6655c985fc7b0eaa261c2350be350a5259caa75a29dcfd575d6e9fc7b53c2e272fcce7ede226a946f606486a94a8
|
data/.rspec
ADDED
data/.travis.yml
ADDED
@@ -1,21 +1,21 @@
|
|
1
1
|
module Telegraph
|
2
2
|
module Parser
|
3
3
|
class Article
|
4
|
-
|
4
|
+
ATTRIBUTES = %i[id title author content images].freeze
|
5
|
+
|
6
|
+
attr_reader *ATTRIBUTES
|
5
7
|
|
6
8
|
def initialize(attributes)
|
7
9
|
attributes.each do |attr, val|
|
8
|
-
next unless
|
9
|
-
|
10
|
+
next unless ATTRIBUTES.include?(attr.to_sym)
|
11
|
+
instance_variable_set(:"@#{attr}", val)
|
10
12
|
end
|
11
13
|
end
|
12
14
|
|
13
|
-
def self.
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
article.id = article_id
|
18
|
-
end
|
15
|
+
def self.find(article_id, image_prefix: '')
|
16
|
+
parser = Parser.new(article_id, image_prefix)
|
17
|
+
parser.fetch_and_parse!
|
18
|
+
Article.new(parser.parsed_data)
|
19
19
|
end
|
20
20
|
end
|
21
21
|
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
|
3
|
+
module Telegraph
|
4
|
+
module Parser
|
5
|
+
class Fetcher
|
6
|
+
HOST = 'telegra.ph'.freeze
|
7
|
+
|
8
|
+
def fetch(article_id)
|
9
|
+
Net::HTTP.get(HOST, "/#{article_id}")
|
10
|
+
rescue EOFError
|
11
|
+
raise Telegraph::Parser::ArticleNotFound
|
12
|
+
end
|
13
|
+
|
14
|
+
def fetch_image(src, prefix)
|
15
|
+
{ image_id(src, prefix) => open(src).read }
|
16
|
+
end
|
17
|
+
|
18
|
+
def image_id(src, prefix)
|
19
|
+
"#{prefix}/#{File.basename(URI.parse(src).path)}"
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -3,48 +3,56 @@ require 'nokogiri'
|
|
3
3
|
module Telegraph
|
4
4
|
module Parser
|
5
5
|
class Parser
|
6
|
-
|
6
|
+
attr_reader :parsed_data
|
7
7
|
|
8
|
-
def initialize(
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
def initialize(article_id, image_prefix)
|
9
|
+
@fetcher = Fetcher.new
|
10
|
+
@article_id = article_id
|
11
|
+
@image_prefix = image_prefix
|
12
|
+
end
|
13
|
+
|
14
|
+
def fetch_and_parse!
|
15
|
+
load_page!
|
16
|
+
load_images!
|
17
|
+
compile_parsed_data!
|
12
18
|
end
|
13
19
|
|
14
20
|
private
|
15
21
|
|
16
|
-
def
|
17
|
-
@page
|
22
|
+
def load_page!
|
23
|
+
@page = Nokogiri::HTML(@fetcher.fetch(@article_id)).css('article')
|
18
24
|
end
|
19
25
|
|
20
|
-
def
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
26
|
+
def compile_parsed_data!
|
27
|
+
@parsed_data = {
|
28
|
+
article_id: @article_id,
|
29
|
+
title: tag_content(:h1),
|
30
|
+
author: tag_content(:address),
|
31
|
+
content: without_tags,
|
32
|
+
images: @images
|
33
|
+
}
|
27
34
|
end
|
28
35
|
|
29
|
-
def
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
36
|
+
def load_images!
|
37
|
+
@images = {}
|
38
|
+
|
39
|
+
@page.search('.//img').each do |image|
|
40
|
+
i = @fetcher.fetch_image(image.attributes['src'].value, @image_prefix)
|
41
|
+
@images.merge!(i)
|
42
|
+
end
|
35
43
|
end
|
36
44
|
|
37
45
|
def tag_content(tag)
|
38
|
-
page.search(".//#{tag}").text
|
46
|
+
@page.search(".//#{tag}").text
|
39
47
|
end
|
40
48
|
|
41
49
|
def without_tags
|
42
|
-
page_copy = page.dup
|
43
|
-
%i(h1 address).each { |tag| page.search(".//#{tag}").remove }
|
50
|
+
page_copy = @page.dup
|
51
|
+
%i(h1 address).each { |tag| @page.search(".//#{tag}").remove }
|
44
52
|
|
45
53
|
page_copy.search('.//img').each do |image|
|
46
54
|
image.attributes['src'].value =
|
47
|
-
|
55
|
+
@fetcher.image_id(image.attributes['src'].value, @image_prefix)
|
48
56
|
end
|
49
57
|
|
50
58
|
page_copy.remove_attr('id').remove_class.to_s
|
data/lib/telegraph/parser.rb
CHANGED
data/telegraph-parser.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: telegraph-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sergey Vernidub
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-06-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 1.6.8
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: pry
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
description:
|
70
84
|
email:
|
71
85
|
- svernidub@gmail.com
|
@@ -74,6 +88,8 @@ extensions: []
|
|
74
88
|
extra_rdoc_files: []
|
75
89
|
files:
|
76
90
|
- ".gitignore"
|
91
|
+
- ".rspec"
|
92
|
+
- ".travis.yml"
|
77
93
|
- Gemfile
|
78
94
|
- LICENSE.txt
|
79
95
|
- README.md
|
@@ -82,7 +98,8 @@ files:
|
|
82
98
|
- bin/setup
|
83
99
|
- lib/telegraph/parser.rb
|
84
100
|
- lib/telegraph/parser/article.rb
|
85
|
-
- lib/telegraph/parser/
|
101
|
+
- lib/telegraph/parser/errors.rb
|
102
|
+
- lib/telegraph/parser/fetcher.rb
|
86
103
|
- lib/telegraph/parser/parser.rb
|
87
104
|
- lib/telegraph/parser/version.rb
|
88
105
|
- telegraph-parser.gemspec
|
@@ -1,31 +0,0 @@
|
|
1
|
-
require 'open-uri'
|
2
|
-
|
3
|
-
module Telegraph
|
4
|
-
module Parser
|
5
|
-
class Binding
|
6
|
-
BASE_URI = 'telegra.ph'
|
7
|
-
|
8
|
-
attr_accessor :article_id
|
9
|
-
|
10
|
-
def initialize(article_id)
|
11
|
-
self.article_id = article_id
|
12
|
-
end
|
13
|
-
|
14
|
-
def content
|
15
|
-
Net::HTTP.get(BASE_URI, "/#{article_id}")
|
16
|
-
rescue EOFError
|
17
|
-
""
|
18
|
-
end
|
19
|
-
|
20
|
-
def image(src, prefix)
|
21
|
-
src.gsub!('http://telegra.ph', '')
|
22
|
-
{ image_id(src, prefix) => open("http://#{BASE_URI}#{src}").read }
|
23
|
-
end
|
24
|
-
|
25
|
-
def image_id(src, prefix)
|
26
|
-
src.gsub!('http://telegra.ph', '')
|
27
|
-
"#{prefix}/#{File.basename(URI.parse("#{BASE_URI}#{src}").path)}"
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|