webxtractor 0.0.3 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/webxtractor.rb +28 -11
- metadata +15 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4747c5d023485e45b20522ea73e097461603ccf7
|
4
|
+
data.tar.gz: 3906405e1d7a52d92b7f5075575884dd6e82c1c8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9115456aed09a43ec83061d403f505befcb440840c45aaa84cde35d6c7572f2bf97bca5d45b87727622e671c9c6241b90ca7f5d107432f875b6690bd2a4f8a76
|
7
|
+
data.tar.gz: 0d9ed83fa60c302bf23d25dafbfae6c51a417456a9c68e050693e940cd2f2d1eb3ee286aabd68e2b14bd36497bc3f0627569aa7fa370365abb9e9e958db10a94
|
data/lib/webxtractor.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'ostruct'
|
2
2
|
require 'nokogiri'
|
3
|
+
require 'open-uri'
|
3
4
|
|
4
5
|
class Webxtractor
|
5
6
|
def self.get(url=nil)
|
@@ -11,23 +12,39 @@ class Webxtractor
|
|
11
12
|
def self.parse(body)
|
12
13
|
page = Nokogiri::HTML(body)
|
13
14
|
result = OpenStruct.new
|
14
|
-
result.title = get_tag('title'
|
15
|
-
result.
|
15
|
+
result.title = get_tag(page, 'title')
|
16
|
+
result.meta_description = get_tag(page,
|
17
|
+
'meta[name=description]',
|
18
|
+
attribute: "content")
|
19
|
+
result.meta_keywords = get_tag(page,
|
20
|
+
'meta[name=keywords]',
|
21
|
+
attribute: "content")
|
22
|
+
result.h1 = get_tag(page, 'h1')
|
16
23
|
result
|
17
24
|
end
|
18
25
|
|
19
|
-
def self.get_tag(selector,
|
20
|
-
|
21
|
-
if
|
22
|
-
|
26
|
+
def self.get_tag(page, selector, attribute: nil)
|
27
|
+
elements = page.css(selector)
|
28
|
+
if elements.size > 1
|
29
|
+
elements.map {|element| get_content(element, attribute) }
|
23
30
|
else
|
24
|
-
|
31
|
+
get_content(elements.first, attribute)
|
25
32
|
end
|
26
33
|
end
|
27
34
|
|
28
|
-
def self.normalize(
|
29
|
-
return if
|
30
|
-
|
31
|
-
|
35
|
+
def self.normalize(text=nil)
|
36
|
+
return if text.nil?
|
37
|
+
text.gsub(/(\r\n|\n|\r)/," ")
|
38
|
+
text.gsub(/\s+/, " ").strip
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.get_content(element, attribute)
|
42
|
+
return if element.nil?
|
43
|
+
text = if element.attributes[attribute].respond_to?(:value)
|
44
|
+
element.attributes[attribute].value
|
45
|
+
else
|
46
|
+
element.text
|
47
|
+
end
|
48
|
+
normalize(text)
|
32
49
|
end
|
33
50
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webxtractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- schmierkov
|
@@ -50,6 +50,20 @@ dependencies:
|
|
50
50
|
- - ">="
|
51
51
|
- !ruby/object:Gem::Version
|
52
52
|
version: 3.3.0
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: rspec
|
55
|
+
requirement: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '3.4'
|
60
|
+
type: :development
|
61
|
+
prerelease: false
|
62
|
+
version_requirements: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - "~>"
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '3.4'
|
53
67
|
description: A simple content extractor
|
54
68
|
email: github@schmierkov.de
|
55
69
|
executables: []
|