husc 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +57 -12
- data/husc.gemspec +1 -1
- data/lib/husc/version.rb +1 -1
- data/lib/husc.rb +7 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 70da435ce2b15bb485ce958997a91488cf19f00b7faf5da3f25c92a891028508
|
4
|
+
data.tar.gz: b363fa06b547c1a5612889739465af7ea46cbeea6b6922aab0d484fa21169ce6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2190d0269954730626eca1d142c8ffd00614bf98fb19d7ba4d595668e944d769636f26c151211a664ba775b2d9f1c3b5594bbd7d5cde05eec7af31e11f82dfff
|
7
|
+
data.tar.gz: 5f0ea49baf5d2fbf6a707f3b580e95364686c2a3e1439d8c3469c090335790ebeef062176e3d67e3afd7592b23e2256f92f6421a14e4b208c94b4e5a1e737a7f
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
|
1
|
+
Husc
|
2
2
|
=======
|
3
3
|
|
4
|
-
|
4
|
+
A simple crawling utility for Ruby.
|
5
5
|
|
6
6
|
|
7
7
|
## Description
|
@@ -16,29 +16,74 @@ This project enables site crawling and data extraction with xpath and css select
|
|
16
16
|
## Usage
|
17
17
|
### Simple Example
|
18
18
|
```ruby
|
19
|
-
require '
|
19
|
+
require 'husc'
|
20
20
|
|
21
21
|
url = 'http://www.example.com/'
|
22
|
-
doc =
|
22
|
+
doc = Husc(url)
|
23
23
|
|
24
|
-
#
|
24
|
+
# access another url
|
25
|
+
doc.get('another url')
|
26
|
+
|
27
|
+
# get current url
|
28
|
+
doc.url
|
29
|
+
|
30
|
+
# get current site's html
|
31
|
+
doc.html
|
32
|
+
|
33
|
+
# get <table> tags as dict
|
34
|
+
doc.tables
|
35
|
+
# ex) doc.tables['予約・お問い合わせ'] => 050-5596-6465
|
36
|
+
```
|
37
|
+
|
38
|
+
### Scraping Example
|
39
|
+
```ruby
|
40
|
+
# search for nodes by css selector
|
41
|
+
# tag : css('name')
|
42
|
+
# class : css('.name')
|
43
|
+
# id : css('#name')
|
25
44
|
doc.css('div')
|
26
45
|
doc.css('.main-text')
|
27
46
|
doc.css('#tadjs')
|
28
47
|
|
29
|
-
#
|
48
|
+
# search for nodes by xpath
|
30
49
|
doc.xpath('//*[@id="top"]/div[1]')
|
31
50
|
|
32
|
-
#
|
33
|
-
doc.css('div').css('a')[2].attr('href')
|
34
|
-
doc.css('p').innerText()
|
35
|
-
doc.tables # -> Table Tag to Dict
|
36
|
-
|
51
|
+
# other example
|
52
|
+
doc.css('div').css('a')[2].attr('href') # => string object
|
53
|
+
doc.css('p').innerText() # => string object
|
37
54
|
# You do not need to specify "[]" to access the first index
|
38
55
|
```
|
39
56
|
|
57
|
+
### Submitting Form Example
|
58
|
+
1. Specify target node's attribute
|
59
|
+
2. Specify value(int or str) / check(bool) / file_name(str)
|
60
|
+
3. Call submit() with the form's attribute specified
|
61
|
+
```ruby
|
62
|
+
# login
|
63
|
+
doc.send(id:'id attribute', value:'value to send')
|
64
|
+
doc.send(id:'id attribute', value:'value to send')
|
65
|
+
doc.submit(id:'id attribute') # submit
|
66
|
+
|
67
|
+
# post file
|
68
|
+
doc.send(id:'id attribute', file_name:'target file name')
|
69
|
+
|
70
|
+
# checkbox
|
71
|
+
doc.send(id:'id attribute', check:true) # check
|
72
|
+
doc.send(id:'id attribute', check:false) # uncheck
|
73
|
+
|
74
|
+
# example of specifying other attributes
|
75
|
+
doc.send(name:'name attribute', value:'hello')
|
76
|
+
doc.send(class:'class attribute', value:100)
|
77
|
+
```
|
78
|
+
|
79
|
+
|
80
|
+
|
40
81
|
|
41
82
|
## Installation
|
42
83
|
```sh
|
43
84
|
$ gem install husc
|
44
|
-
```
|
85
|
+
```
|
86
|
+
|
87
|
+
|
88
|
+
## Contributing
|
89
|
+
Bug reports and pull requests are welcome on GitHub at [https://github.com/AjxLab/husc](https://github.com/AjxLab/husc).
|
data/husc.gemspec
CHANGED
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
|
|
13
13
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
|
14
14
|
|
15
15
|
spec.metadata["homepage_uri"] = spec.homepage
|
16
|
-
spec.metadata["source_code_uri"] = "https://github.com/AjxLab/
|
16
|
+
spec.metadata["source_code_uri"] = "https://github.com/AjxLab/husc"
|
17
17
|
|
18
18
|
# Specify which files should be added to the gem when it is released.
|
19
19
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
data/lib/husc/version.rb
CHANGED
data/lib/husc.rb
CHANGED
@@ -3,11 +3,12 @@ require 'mechanize'
|
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'net/http'
|
5
5
|
require 'kconv'
|
6
|
-
require
|
6
|
+
require 'husc/version'
|
7
7
|
|
8
8
|
module Husc
|
9
9
|
class Error < StandardError; end
|
10
|
-
|
10
|
+
|
11
|
+
class Crawler
|
11
12
|
attr_reader :url, :html, :tables, :params
|
12
13
|
|
13
14
|
# 特殊配列
|
@@ -28,7 +29,7 @@ module Husc
|
|
28
29
|
|
29
30
|
def method_missing(method, *args)
|
30
31
|
if self == []
|
31
|
-
return eval("
|
32
|
+
return eval("Crawler.new(doc: nil).#{method}(*#{args})")
|
32
33
|
end
|
33
34
|
|
34
35
|
return eval("self[0].#{method}(*#{args})")
|
@@ -118,7 +119,7 @@ module Husc
|
|
118
119
|
|
119
120
|
def xpath(locator, single = false)
|
120
121
|
## -----*----- HTMLからXPath指定で要素取得 -----*----- ##
|
121
|
-
elements = CrawlArray.new(@doc.xpath(locator).map {|el|
|
122
|
+
elements = CrawlArray.new(@doc.xpath(locator).map {|el| Crawler.new(doc: el)})
|
122
123
|
if single
|
123
124
|
# シングルノード
|
124
125
|
if elements[0] == nil
|
@@ -134,7 +135,7 @@ module Husc
|
|
134
135
|
|
135
136
|
def css(locator, single = false)
|
136
137
|
## -----*----- HTMLからCSSセレクタで要素取得 -----*----- ##
|
137
|
-
elements = CrawlArray.new(@doc.css(locator).map {|el|
|
138
|
+
elements = CrawlArray.new(@doc.css(locator).map {|el| Crawler.new(doc: el)})
|
138
139
|
if single
|
139
140
|
# シングルノード
|
140
141
|
if elements[0] == nil
|
@@ -211,3 +212,4 @@ module Husc
|
|
211
212
|
end
|
212
213
|
end
|
213
214
|
end
|
215
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: husc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tatsuya Abe
|
@@ -1077,7 +1077,7 @@ licenses:
|
|
1077
1077
|
- MIT
|
1078
1078
|
metadata:
|
1079
1079
|
homepage_uri: https://github.com/AjxLab/husc
|
1080
|
-
source_code_uri: https://github.com/AjxLab/
|
1080
|
+
source_code_uri: https://github.com/AjxLab/husc
|
1081
1081
|
post_install_message:
|
1082
1082
|
rdoc_options: []
|
1083
1083
|
require_paths:
|