feedstock 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +147 -2
- data/feedstock.gemspec +2 -2
- data/lib/feedstock.rb +40 -4
- data/lib/feedstock/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 60dc0bcb05928b59220fe1ed6ac24487428ceef5279f454bce047d3b3a94a56d
|
4
|
+
data.tar.gz: 91d7a161cdd3aedaf2316082b9f5bbae2fa48dcfd9d599cd65ee746673afc2b8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4513cbec520821710ed756544b1a1f7797498a5769395333e19a96c39a466cf83291863944973020ad33a9adb21b58ae2370c3bf08a72843683df1025432fc7c
|
7
|
+
data.tar.gz: 327995486920781894858903f4d510b958ee85b04a9aebf7139a4934f73c8f5af446b8eda6a06bf0e158692c4de278ebfc007b7f729f6bca953c67ea8eaff432
|
data/README.md
CHANGED
@@ -21,6 +21,150 @@ Feedstock is available as a gem:
|
|
21
21
|
$ gem install feedstock
|
22
22
|
```
|
23
23
|
|
24
|
+
## Usage
|
25
|
+
|
26
|
+
Feedstock extracts information from a given document using a collection of
|
27
|
+
_rules_.
|
28
|
+
|
29
|
+
A collection of rules is expressed as a hash. The hash has two mandatory keys
|
30
|
+
and one optional key.
|
31
|
+
|
32
|
+
### Info
|
33
|
+
|
34
|
+
The `"info"` key is mandatory. It must be associated with a hash. This document
|
35
|
+
refers to this hash as the 'info hash'.
|
36
|
+
|
37
|
+
#### Keys
|
38
|
+
|
39
|
+
The keys in the info hash are strings (not symbols). When used with the default
|
40
|
+
template, Feedstock will use the key as the name of the XML entity in the
|
41
|
+
resulting feed. For example, if the key is `"id"`, the XML entity in the
|
42
|
+
resulting feed will be `<id>`.
|
43
|
+
|
44
|
+
#### Values
|
45
|
+
|
46
|
+
The value associated with each key in the info hash can be either a string or a
|
47
|
+
hash.
|
48
|
+
|
49
|
+
##### String
|
50
|
+
|
51
|
+
If the value is a string, this defines a path to a node in the document. The
|
52
|
+
path is expressed using CSS's selector syntax. Although a CSS selector can match
|
53
|
+
more than one node, when used in the info hash, a path will only match the first
|
54
|
+
matching node in the document.
|
55
|
+
|
56
|
+
##### Hash
|
57
|
+
|
58
|
+
If the value is a hash, this is the 'data hash'. The data hash defines the
|
59
|
+
rules that Feedstock uses to extract data. It must contain one of two keys:
|
60
|
+
|
61
|
+
- `"literal"`: The value associated with this key is used for the content of the
|
62
|
+
XML entity. This can be useful for elements that are not on the page or that
|
63
|
+
don't change.
|
64
|
+
|
65
|
+
- `"path"`: The path to the node in the document expressed in CSS's selector
|
66
|
+
syntax. As noted above, if the value of a key in the info hash is a string,
|
67
|
+
this is treated as a path. The reason to use a data hash with a `"path"` key
|
68
|
+
is when using one or more of the keys below. In the info hash, a path matches
|
69
|
+
only the first matching node in the document.
|
70
|
+
|
71
|
+
The following keys may also be defined in a data hash:
|
72
|
+
|
73
|
+
- `"attribute"`: The default is `nil`. If an attribute is provided, Feedstock
|
74
|
+
will extract the content of the attribute rather than the content of the node.
|
75
|
+
This is important for links, where the link itself is typically the content of
|
76
|
+
the `href` attribute rather than the content of the `<a>` element.
|
77
|
+
|
78
|
+
- `"prefix"`: The default is `nil`. If a prefix is provided, the string value of
|
79
|
+
the prefix is prepended to the content extracted.
|
80
|
+
|
81
|
+
- `"suffix"`: The default is `nil`. If a suffix is provided, the string value of
|
82
|
+
the suffix is appended to the end of the content extracted.
|
83
|
+
|
84
|
+
- `"type"`: The default is `nil`. This causes Feedstock to extract only the text
|
85
|
+
in a node (stripping out all HTML). However, a user may specify `"datetime"`
|
86
|
+
or `"cdata"`. `"datetime"` content is parsed by [the Timeliness
|
87
|
+
library][Timeliness] (this is bundled with Feedstock) to return a string.
|
88
|
+
`"cdata"` content includes any HTML and is wrapped in `<![CDATA[` and `]]>`
|
89
|
+
tags.
|
90
|
+
|
91
|
+
[Timeliness]: https://github.com/adzap/timeliness "The official repository for
|
92
|
+
the Timeliness library"
|
93
|
+
|
94
|
+
### Entry
|
95
|
+
|
96
|
+
The `"entry"` key is mandatory. It must be associated with a hash. This document
|
97
|
+
refers to this hash as the 'entry hash'.
|
98
|
+
|
99
|
+
#### Keys
|
100
|
+
|
101
|
+
The keys in the entry hash are strings (not symbols). When used with the default
|
102
|
+
template, Feedstock will use the key as the name of the XML entity in the
|
103
|
+
resulting feed. For example, if the key is `"id"`, the XML entity in the
|
104
|
+
resulting feed will be `<id>`.
|
105
|
+
|
106
|
+
#### Values
|
107
|
+
|
108
|
+
The value associated with each key in the entry hash can be either a string or a
|
109
|
+
hash.
|
110
|
+
|
111
|
+
##### String
|
112
|
+
|
113
|
+
If the value is a string, this defines a path to a node in the document. The
|
114
|
+
path is expressed using CSS's selector syntax. Unlike with the info hash,
|
115
|
+
the CSS selector will match all nodes.
|
116
|
+
|
117
|
+
##### Hash
|
118
|
+
|
119
|
+
If the value is a hash, we call this the "data hash". The data hash defines the
|
120
|
+
rules that Feedstock uses to extract data. It must contain one of two keys:
|
121
|
+
|
122
|
+
- `"literal"`: The value associated with this key is used for the content of the
|
123
|
+
XML entity. This can be useful for elements that are not on the page or that
|
124
|
+
don't change.
|
125
|
+
|
126
|
+
- `"path"`: The path to the node in the document expressed in CSS's selector
|
127
|
+
syntax. Unlike with the info hash, the CSS selector will match all nodes.
|
128
|
+
|
129
|
+
The following keys may also be defined in a data hash:
|
130
|
+
|
131
|
+
- `"attribute"`: The default is `nil`. If an attribute is provided, Feedstock
|
132
|
+
will extract the content of the attribute rather than the content of the node.
|
133
|
+
This is important for links, where the link itself is typically the content of
|
134
|
+
the `href` attribute rather than the content of the `<a>` element.
|
135
|
+
|
136
|
+
- `"infix"`: The default is `nil`. If the entries hash has been provided (see
|
137
|
+
below), then the string value of the infix is inserted between the content of
|
138
|
+
each matching node. If the entries hash has not been provided, this is ignored.
|
139
|
+
|
140
|
+
- `"prefix"`: The default is `nil`. If a prefix is provided, the string value of
|
141
|
+
the prefix is prepended to the content extracted.
|
142
|
+
|
143
|
+
- `"repeat"`: The default is `nil`. If repeat is set to `true`, Feedstock will
|
144
|
+
use the content provided by either `"literal"` or `"path"` repeatedly. Since
|
145
|
+
the value of `"literal"` implies `"repeat"`, it is not necessary to specify it
|
146
|
+
expressly.
|
147
|
+
|
148
|
+
- `"suffix"`: The default is `nil`. If a suffix is provided, the string value of
|
149
|
+
the suffix is appended to the end of the content extracted.
|
150
|
+
|
151
|
+
- `"type"`: The default is `nil`. This causes Feedstock to extract only the text
|
152
|
+
in a node (stripping out all HTML). However, a user may specify `"datetime"`
|
153
|
+
or `"cdata"`. `"datetime"` content is parsed by [the Timeliness
|
154
|
+
library][Timeliness] (this is bundled with Feedstock) to return a string.
|
155
|
+
`"cdata"` content includes any HTML and is wrapped in `<![CDATA[` and `]]>`
|
156
|
+
tags.
|
157
|
+
|
158
|
+
### Entries
|
159
|
+
|
160
|
+
The `"entries"` key is optional. It can be associated with a hash. This document
|
161
|
+
refers to this hash as the 'entries hash'.
|
162
|
+
|
163
|
+
If an entries hash is provided, it must contain the following key:
|
164
|
+
|
165
|
+
- `"path"`: The path to the node in the document expressed in CSS's selector
|
166
|
+
syntax. This path is used as the root for the paths in the entry hash.
|
167
|
+
|
24
168
|
## Bugs
|
25
169
|
|
26
170
|
Found a bug? I'd love to know about it. The best way is to report them in the
|
@@ -30,12 +174,13 @@ Found a bug? I'd love to know about it. The best way is to report them in the
|
|
30
174
|
|
31
175
|
## Versioning
|
32
176
|
|
33
|
-
|
177
|
+
Feedstock uses [Semantic Versioning 2.0.0][sv2].
|
34
178
|
|
35
179
|
[sv2]: http://semver.org/
|
36
180
|
|
37
181
|
## Licence
|
38
182
|
|
39
|
-
|
183
|
+
Feedstock is released into the public domain. See [LICENSE.md][lc] for more
|
184
|
+
details.
|
40
185
|
|
41
186
|
[lc]: https://github.com/pyrmont/feedstock/blob/master/LICENSE.md
|
data/feedstock.gemspec
CHANGED
@@ -19,13 +19,13 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.files = Dir["Gemfile", "default.xml", "LICENSE", "README.md",
|
20
20
|
"feedstock.gemspec", "lib/feedstock.rb", "lib/**/*.rb"]
|
21
21
|
s.require_paths = ["lib"]
|
22
|
-
|
22
|
+
|
23
23
|
s.metadata["allowed_push_host"] = "https://rubygems.org"
|
24
24
|
|
25
25
|
s.add_runtime_dependency "nokogiri"
|
26
26
|
s.add_runtime_dependency "timeliness"
|
27
27
|
|
28
|
-
s.add_development_dependency "minitest"
|
28
|
+
s.add_development_dependency "minitest"
|
29
29
|
s.add_development_dependency "rake"
|
30
30
|
s.add_development_dependency "warning"
|
31
31
|
end
|
data/lib/feedstock.rb
CHANGED
@@ -22,14 +22,22 @@ module Feedstock
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def self.download_page(url)
|
25
|
-
Nokogiri::HTML open(url)
|
25
|
+
Nokogiri::HTML URI.open(url)
|
26
26
|
end
|
27
27
|
|
28
28
|
def self.extract_entries(page, rules)
|
29
|
+
if rules["entries"]
|
30
|
+
extract_entries_wrapped page, rules
|
31
|
+
else
|
32
|
+
extract_entries_unwrapped page, rules
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.extract_entries_unwrapped(page, rules)
|
29
37
|
static = Hash.new
|
30
38
|
entries = Array.new
|
31
39
|
|
32
|
-
rules[
|
40
|
+
rules["entry"].each do |name, rule|
|
33
41
|
if rule["literal"]
|
34
42
|
static[name] = rule["literal"]
|
35
43
|
elsif rule["repeat"]
|
@@ -49,6 +57,28 @@ module Feedstock
|
|
49
57
|
entries
|
50
58
|
end
|
51
59
|
|
60
|
+
def self.extract_entries_wrapped(page, rules)
|
61
|
+
entries = Array.new
|
62
|
+
|
63
|
+
page.css(rules["entries"]["path"]).each.with_index do |node, i|
|
64
|
+
rules["entry"].each do |name, rule|
|
65
|
+
entries[i] = Hash.new if entries[i].nil?
|
66
|
+
|
67
|
+
content = if rule["literal"]
|
68
|
+
rule["literal"]
|
69
|
+
elsif rule["repeat"]
|
70
|
+
format_content page.at_css(rule["path"]), rule
|
71
|
+
else
|
72
|
+
format_content node.at_css(rule["path"]), rule
|
73
|
+
end
|
74
|
+
|
75
|
+
entries[i].merge!({ name => content })
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
entries
|
80
|
+
end
|
81
|
+
|
52
82
|
def self.extract_info(page, rules)
|
53
83
|
info = Hash.new
|
54
84
|
|
@@ -84,8 +114,14 @@ module Feedstock
|
|
84
114
|
|
85
115
|
def self.normalise_rules(rules)
|
86
116
|
rules.keys.each do |category|
|
87
|
-
|
88
|
-
|
117
|
+
case category
|
118
|
+
when "info", "entry"
|
119
|
+
rules[category].each do |name, rule|
|
120
|
+
rules[category][name] = { "path" => rule } unless rule.is_a? Hash
|
121
|
+
end
|
122
|
+
when "entries"
|
123
|
+
rule = rules[category]
|
124
|
+
rules[category] = { "path" => rule } unless rule.is_a? Hash
|
89
125
|
end
|
90
126
|
end
|
91
127
|
|
data/lib/feedstock/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feedstock
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Camilleri
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -115,7 +115,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
115
115
|
- !ruby/object:Gem::Version
|
116
116
|
version: '0'
|
117
117
|
requirements: []
|
118
|
-
rubygems_version: 3.
|
118
|
+
rubygems_version: 3.1.2
|
119
119
|
signing_key:
|
120
120
|
specification_version: 4
|
121
121
|
summary: A library for creating RSS feeds from webpages
|