webxtractor 0.0.3 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/webxtractor.rb +28 -11
  3. metadata +15 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2648fcab56f4879a51dbaeb6300a572a71b43ac2
4
- data.tar.gz: ec3d4bb9409bf610379e632912f91640dc0305ce
3
+ metadata.gz: 4747c5d023485e45b20522ea73e097461603ccf7
4
+ data.tar.gz: 3906405e1d7a52d92b7f5075575884dd6e82c1c8
5
5
  SHA512:
6
- metadata.gz: d6f48d46163786d466d87aeec573d932494557dd34ee01d7419a115fdc8ab9176a5a8f60e112c2ce624e3b513414374051fa984ed816a7cadbe4b964fec4fdbb
7
- data.tar.gz: fdba6ee9ee7a7d7b64b16ea0e9e803a83b0883640e5bd964c4b183dd21098f5c9576a9d55d61809dbbeb11d3dd0049ce387b77b110bec40840ea39f844ee8c87
6
+ metadata.gz: 9115456aed09a43ec83061d403f505befcb440840c45aaa84cde35d6c7572f2bf97bca5d45b87727622e671c9c6241b90ca7f5d107432f875b6690bd2a4f8a76
7
+ data.tar.gz: 0d9ed83fa60c302bf23d25dafbfae6c51a417456a9c68e050693e940cd2f2d1eb3ee286aabd68e2b14bd36497bc3f0627569aa7fa370365abb9e9e958db10a94
data/lib/webxtractor.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'ostruct'
2
2
  require 'nokogiri'
3
+ require 'open-uri'
3
4
 
4
5
  class Webxtractor
5
6
  def self.get(url=nil)
@@ -11,23 +12,39 @@ class Webxtractor
11
12
  def self.parse(body)
12
13
  page = Nokogiri::HTML(body)
13
14
  result = OpenStruct.new
14
- result.title = get_tag('title', page)
15
- result.h1 = get_tag('h1', page)
15
+ result.title = get_tag(page, 'title')
16
+ result.meta_description = get_tag(page,
17
+ 'meta[name=description]',
18
+ attribute: "content")
19
+ result.meta_keywords = get_tag(page,
20
+ 'meta[name=keywords]',
21
+ attribute: "content")
22
+ result.h1 = get_tag(page, 'h1')
16
23
  result
17
24
  end
18
25
 
19
- def self.get_tag(selector, page)
20
- element = page.css(selector)
21
- if element.size > 1
22
- element.map {|x| normalize(x.text) }
26
+ def self.get_tag(page, selector, attribute: nil)
27
+ elements = page.css(selector)
28
+ if elements.size > 1
29
+ elements.map {|element| get_content(element, attribute) }
23
30
  else
24
- normalize(element.text)
31
+ get_content(elements.first, attribute)
25
32
  end
26
33
  end
27
34
 
28
- def self.normalize(content='')
29
- return if content.nil?
30
- content.gsub(/(\r\n|\n|\r)/," ")
31
- content.gsub(/\s+/, " ").strip
35
+ def self.normalize(text=nil)
36
+ return if text.nil?
37
+ text.gsub(/(\r\n|\n|\r)/," ")
38
+ text.gsub(/\s+/, " ").strip
39
+ end
40
+
41
+ def self.get_content(element, attribute)
42
+ return if element.nil?
43
+ text = if element.attributes[attribute].respond_to?(:value)
44
+ element.attributes[attribute].value
45
+ else
46
+ element.text
47
+ end
48
+ normalize(text)
32
49
  end
33
50
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webxtractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - schmierkov
@@ -50,6 +50,20 @@ dependencies:
50
50
  - - ">="
51
51
  - !ruby/object:Gem::Version
52
52
  version: 3.3.0
53
+ - !ruby/object:Gem::Dependency
54
+ name: rspec
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: '3.4'
60
+ type: :development
61
+ prerelease: false
62
+ version_requirements: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - "~>"
65
+ - !ruby/object:Gem::Version
66
+ version: '3.4'
53
67
  description: A simple content extractor
54
68
  email: github@schmierkov.de
55
69
  executables: []