burly 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +28 -2
- data/burly.gemspec +2 -2
- data/lib/burly/parser.rb +2 -2
- data/lib/burly/parsers/html_parser.rb +15 -2
- data/lib/burly.rb +17 -3
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 03d52fe37c2dc18c93b9e2bf74d149760264f2bb4014a0d9cc54fae78d3e1b63
|
4
|
+
data.tar.gz: 5e984dbdd20148684b992dfa19ea3638fc6218abcb451db556422e9f614e7ddb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 837debbc2f3ddcf3b4c53663e6a0a5100ce96d89adc9fd5ce160a6c93f1d6ac38026ba00567e1e2c07920902bc63c2435240c61b4a2c975826a45168ce669996
|
7
|
+
data.tar.gz: da0af5413e7c00d4903d536db24bf64fc90ad2beea353cb1747a0455d3ef68bf98f42fe7847dfee651190154c47cb6cc37414b00213ff4d3c45ef2f9a7e21d1e
|
data/README.md
CHANGED
@@ -33,9 +33,9 @@ Burly.parse(File.read("example.txt"))
|
|
33
33
|
Parsing JSON or HTML documents is only slightly more complicated:
|
34
34
|
|
35
35
|
```ruby
|
36
|
-
Burly.parse(File.read("example.
|
36
|
+
Burly.parse(File.read("example.html"), mime_type: "text/html")
|
37
37
|
|
38
|
-
Burly.parse(File.read("example.
|
38
|
+
Burly.parse(File.read("example.json"), mime_type: "application/json")
|
39
39
|
```
|
40
40
|
|
41
41
|
Burly uses _slightly_ different parsing rules for each supported MIME type:
|
@@ -46,6 +46,32 @@ Burly uses _slightly_ different parsing rules for each supported MIME type:
|
|
46
46
|
|
47
47
|
In all cases, neither order nor uniqueness is guaranteed. You may also consider converting relative URLs extracted from HTML documents to absolute URLs using the document's source URL and/or the `<base>` element's `href` attribute value (Ruby's [`URI.join` class method](https://docs.ruby-lang.org/en/master/URI.html#method-c-join) is good for this!).
|
48
48
|
|
49
|
+
## Parser Options
|
50
|
+
|
51
|
+
Burly's HTML parser supports a single option, `context`, which accepts either a String or an Array of Strings. The values may be either CSS or XPath selectors.
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
Burly.parse(File.read("example.html"), context: "main", mime_type: "text/html")
|
55
|
+
|
56
|
+
Burly.parse(File.read("example.html"), context: ["//main", "//div"], mime_type: "text/html")
|
57
|
+
```
|
58
|
+
|
59
|
+
In all cases, Burly will search for nodes matching the provided selector(s) and use the _first_ match as the context within which to search for URLs. The `context` option is a great way to refine the list of extracted URLs based on their presence within the source document.
|
60
|
+
|
61
|
+
> [!NOTE]
|
62
|
+
> If Burly can't locate a node matching the provided selector(s), the context is reset to the document root.
|
63
|
+
|
64
|
+
> [!TIP]
|
65
|
+
> Passing an Array of Strings can be used to achieve an effect similar to conditional logic with fallback behavior.
|
66
|
+
>
|
67
|
+
> ```ruby
|
68
|
+
> require "net/http"
|
69
|
+
>
|
70
|
+
> response = Net::HTTP.get(URI.parse("https://jgarber.example"))
|
71
|
+
>
|
72
|
+
> Burly.parse(response, context: [".h-entry .e-content", ".h-entry", "body"], mime_type: "text/html")
|
73
|
+
> ```
|
74
|
+
|
49
75
|
## License
|
50
76
|
|
51
77
|
Burly is freely available under the [MIT License](https://opensource.org/license/MIT).
|
data/burly.gemspec
CHANGED
@@ -4,7 +4,7 @@ Gem::Specification.new do |spec|
|
|
4
4
|
spec.required_ruby_version = ">= 2.6"
|
5
5
|
|
6
6
|
spec.name = "burly"
|
7
|
-
spec.version = "0.
|
7
|
+
spec.version = "0.2.0"
|
8
8
|
spec.authors = ["Jason Garber"]
|
9
9
|
spec.email = ["jason@sixtwothree.org"]
|
10
10
|
|
@@ -25,7 +25,7 @@ Gem::Specification.new do |spec|
|
|
25
25
|
"documentation_uri" => "https://rubydoc.info/gems/#{spec.name}/#{spec.version}",
|
26
26
|
"homepage_uri" => spec.homepage,
|
27
27
|
"rubygems_mfa_required" => "true",
|
28
|
-
"source_code_uri" => "#{spec.homepage}/
|
28
|
+
"source_code_uri" => "#{spec.homepage}/src/tag/v#{spec.version}",
|
29
29
|
}
|
30
30
|
|
31
31
|
spec.add_dependency "nokogiri", ">= 1.13"
|
data/lib/burly/parser.rb
CHANGED
@@ -33,9 +33,17 @@ module Burly
|
|
33
33
|
|
34
34
|
ATTRIBUTES_XPATHS =
|
35
35
|
URL_ATTRIBUTES_MAP.merge(SRCSET_ATTRIBUTES_MAP).flat_map do |attribute, names|
|
36
|
-
names.map { |name| "
|
36
|
+
names.map { |name| ".//#{name} / @#{attribute}" }
|
37
37
|
end
|
38
38
|
|
39
|
+
# @param document (see Burly.parse)
|
40
|
+
# @param context [String, Array<String>]
|
41
|
+
def initialize(document, context: nil)
|
42
|
+
@context = context
|
43
|
+
|
44
|
+
super
|
45
|
+
end
|
46
|
+
|
39
47
|
# Parse an HTML document for absolute or relative URLs.
|
40
48
|
#
|
41
49
|
# @return [Array<String>]
|
@@ -53,7 +61,12 @@ module Burly
|
|
53
61
|
|
54
62
|
# @return [Nokogiri::XML::NodeSet]
|
55
63
|
def attr_nodes
|
56
|
-
@attr_nodes ||=
|
64
|
+
@attr_nodes ||= context_node.xpath(*ATTRIBUTES_XPATHS)
|
65
|
+
end
|
66
|
+
|
67
|
+
# @return [Nokogiri::HTML5::Document, Nokogiri::XML::Element]
|
68
|
+
def context_node
|
69
|
+
@context_node ||= doc.search(*Array(@context)).first || doc
|
57
70
|
end
|
58
71
|
|
59
72
|
# @return [Nokogiri::HTML5::Document]
|
data/lib/burly.rb
CHANGED
@@ -16,15 +16,29 @@ module Burly
|
|
16
16
|
attr_reader :registered_parsers
|
17
17
|
end
|
18
18
|
|
19
|
-
#
|
19
|
+
# Parse a document for URLs.
|
20
|
+
#
|
21
|
+
# @example Parse a plaintext document.
|
22
|
+
# Burly.parse(File.read("example.txt"))
|
23
|
+
#
|
24
|
+
# @example Parse an HTML document.
|
25
|
+
# Burly.parse(File.read("example.html"), mime_type: "text/html")
|
26
|
+
#
|
27
|
+
# @example Parse a JSON document.
|
28
|
+
# Burly.parse(File.read("example.json"), mime_type: "application/json")
|
29
|
+
#
|
30
|
+
# @param document [String] The document to parse for URLs.
|
31
|
+
#
|
32
|
+
# @raise [UnsupportedMimeType]
|
33
|
+
# Raised when an unsupported MIME type is passed as an option.
|
20
34
|
#
|
21
35
|
# @return [Array<String>]
|
22
|
-
def self.parse(document, mime_type: "text/plain")
|
36
|
+
def self.parse(document, mime_type: "text/plain", **options)
|
23
37
|
parser = registered_parsers[mime_type]
|
24
38
|
|
25
39
|
raise UnsupportedMimeType unless parser
|
26
40
|
|
27
|
-
parser.new(document).parse
|
41
|
+
parser.new(document, **options).parse
|
28
42
|
end
|
29
43
|
|
30
44
|
# @api private
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: burly
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jason Garber
|
@@ -44,11 +44,11 @@ licenses:
|
|
44
44
|
- MIT
|
45
45
|
metadata:
|
46
46
|
bug_tracker_uri: https://codeberg.org/jgarber/burly/issues
|
47
|
-
changelog_uri: https://codeberg.org/jgarber/burly/releases/tag/v0.
|
48
|
-
documentation_uri: https://rubydoc.info/gems/burly/0.
|
47
|
+
changelog_uri: https://codeberg.org/jgarber/burly/releases/tag/v0.2.0
|
48
|
+
documentation_uri: https://rubydoc.info/gems/burly/0.2.0
|
49
49
|
homepage_uri: https://codeberg.org/jgarber/burly
|
50
50
|
rubygems_mfa_required: 'true'
|
51
|
-
source_code_uri: https://codeberg.org/jgarber/burly/
|
51
|
+
source_code_uri: https://codeberg.org/jgarber/burly/src/tag/v0.2.0
|
52
52
|
rdoc_options: []
|
53
53
|
require_paths:
|
54
54
|
- lib
|