webxtractor 0.0.3 → 0.0.5

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/webxtractor.rb +28 -11
  3. metadata +15 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2648fcab56f4879a51dbaeb6300a572a71b43ac2
4
- data.tar.gz: ec3d4bb9409bf610379e632912f91640dc0305ce
3
+ metadata.gz: 4747c5d023485e45b20522ea73e097461603ccf7
4
+ data.tar.gz: 3906405e1d7a52d92b7f5075575884dd6e82c1c8
5
5
  SHA512:
6
- metadata.gz: d6f48d46163786d466d87aeec573d932494557dd34ee01d7419a115fdc8ab9176a5a8f60e112c2ce624e3b513414374051fa984ed816a7cadbe4b964fec4fdbb
7
- data.tar.gz: fdba6ee9ee7a7d7b64b16ea0e9e803a83b0883640e5bd964c4b183dd21098f5c9576a9d55d61809dbbeb11d3dd0049ce387b77b110bec40840ea39f844ee8c87
6
+ metadata.gz: 9115456aed09a43ec83061d403f505befcb440840c45aaa84cde35d6c7572f2bf97bca5d45b87727622e671c9c6241b90ca7f5d107432f875b6690bd2a4f8a76
7
+ data.tar.gz: 0d9ed83fa60c302bf23d25dafbfae6c51a417456a9c68e050693e940cd2f2d1eb3ee286aabd68e2b14bd36497bc3f0627569aa7fa370365abb9e9e958db10a94
data/lib/webxtractor.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'ostruct'
2
2
  require 'nokogiri'
3
+ require 'open-uri'
3
4
 
4
5
  class Webxtractor
5
6
  def self.get(url=nil)
@@ -11,23 +12,39 @@ class Webxtractor
11
12
  def self.parse(body)
12
13
  page = Nokogiri::HTML(body)
13
14
  result = OpenStruct.new
14
- result.title = get_tag('title', page)
15
- result.h1 = get_tag('h1', page)
15
+ result.title = get_tag(page, 'title')
16
+ result.meta_description = get_tag(page,
17
+ 'meta[name=description]',
18
+ attribute: "content")
19
+ result.meta_keywords = get_tag(page,
20
+ 'meta[name=keywords]',
21
+ attribute: "content")
22
+ result.h1 = get_tag(page, 'h1')
16
23
  result
17
24
  end
18
25
 
19
- def self.get_tag(selector, page)
20
- element = page.css(selector)
21
- if element.size > 1
22
- element.map {|x| normalize(x.text) }
26
+ def self.get_tag(page, selector, attribute: nil)
27
+ elements = page.css(selector)
28
+ if elements.size > 1
29
+ elements.map {|element| get_content(element, attribute) }
23
30
  else
24
- normalize(element.text)
31
+ get_content(elements.first, attribute)
25
32
  end
26
33
  end
27
34
 
28
- def self.normalize(content='')
29
- return if content.nil?
30
- content.gsub(/(\r\n|\n|\r)/," ")
31
- content.gsub(/\s+/, " ").strip
35
+ def self.normalize(text=nil)
36
+ return if text.nil?
37
+ text.gsub(/(\r\n|\n|\r)/," ")
38
+ text.gsub(/\s+/, " ").strip
39
+ end
40
+
41
+ def self.get_content(element, attribute)
42
+ return if element.nil?
43
+ text = if element.attributes[attribute].respond_to?(:value)
44
+ element.attributes[attribute].value
45
+ else
46
+ element.text
47
+ end
48
+ normalize(text)
32
49
  end
33
50
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webxtractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - schmierkov
@@ -50,6 +50,20 @@ dependencies:
50
50
  - - ">="
51
51
  - !ruby/object:Gem::Version
52
52
  version: 3.3.0
53
+ - !ruby/object:Gem::Dependency
54
+ name: rspec
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: '3.4'
60
+ type: :development
61
+ prerelease: false
62
+ version_requirements: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - "~>"
65
+ - !ruby/object:Gem::Version
66
+ version: '3.4'
53
67
  description: A simple content extractor
54
68
  email: github@schmierkov.de
55
69
  executables: []