juxtaparssionate 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/README.rdoc +5 -0
- data/lib/parser.rb +43 -0
- data/lib/scraper.rb +23 -0
- data/lib/version.rb +1 -0
- data/test/files/job_posting.txt +1 -0
- data/test/test_parser.rb +32 -0
- metadata +122 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
---
|
|
2
|
+
!binary "U0hBMQ==":
|
|
3
|
+
metadata.gz: !binary |-
|
|
4
|
+
ZTY1Y2U2NGMzZjBmNGFhM2I2ZWE5M2VmMmZjOGJiNzI3YTY1M2Q0ZQ==
|
|
5
|
+
data.tar.gz: !binary |-
|
|
6
|
+
OTg1MTFjNGI2NDEyZDM2ZTU3OWFjNGY2YzhjN2Q3YTE1ZjM4ZjI1OQ==
|
|
7
|
+
SHA512:
|
|
8
|
+
metadata.gz: !binary |-
|
|
9
|
+
ZTNkODg3MGYyZGM0OGM3OTI3MzVkNGZmZTk5MTA5OTQzMjYxYmZjMmE5NzFm
|
|
10
|
+
OTEyMGRkOGViMTgwN2Q0YWIxMDhiNmM5ZjUzYmI2NjM1OTFjNzg5MTAwYjQ0
|
|
11
|
+
M2U1ZmFlN2NlZTU4MzAyMmJiNzZmZGQzNzc3ZWEyYzhkYmQ1Y2U=
|
|
12
|
+
data.tar.gz: !binary |-
|
|
13
|
+
ZGQ1YjdlMGE5MmUyNTA5MzVlOTcwZGFiMzVmMzRkMGZiNjFkZGY3NzY3YTNh
|
|
14
|
+
OGI1YjVlYjk0MmMzMDFkMjY4YzZmMjAwMDRjMTkzYWUxNjU5ZWZjN2E2YzA1
|
|
15
|
+
YzM0Y2UxNjlmMDg3NzRkMDhhNjRkM2VjMjc3ZTA4NThhY2E5NjU=
|
data/README.rdoc
ADDED
data/lib/parser.rb
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
require 'sanitize'
|
|
2
|
+
require 'treat'
|
|
3
|
+
|
|
4
|
+
# Mixes Treat's DSL module into the top-level object. Presumably this is what
# lets Parser#initialize call `document` without a receiver -- TODO confirm.
include Treat::Core::DSL
|
|
5
|
+
|
|
6
|
+
# Wraps a single job posting and offers helpers for stripping markup,
# extracting words and nouns, and computing word frequencies.
class Parser
  attr_accessor :contents

  # Builds the document for the posting at +file_path+.
  #
  # `document` is not defined in this class; presumably it comes from Treat's
  # DSL being included at the top level of this file -- confirm.
  #
  # @param file_path [#to_s] location of the posting to load
  def initialize(file_path)
    @contents = document file_path.to_s
  end

  # Runs the contents through Sanitize and collapses every run of whitespace
  # into a single space.
  #
  # NOTE(review): @contents is whatever `document` returned, not necessarily a
  # String; verify Sanitize.fragment accepts it.
  #
  # @return [String] the sanitized text joined by single spaces
  def strip_tags
    Sanitize.fragment(@contents).split.join(' ')
  end

  # Applies the :chunk, :segment, :tokenize and :category steps to the
  # contents and returns the nouns of the result as strings.
  #
  # @return [Array<String>]
  def extract_nouns
    @contents.apply(:chunk, :segment, :tokenize, :category).nouns.map(&:to_s)
  end

  # Applies the :chunk, :segment and :tokenize steps to the tag-stripped text
  # and returns each resulting element as a string.
  #
  # NOTE(review): `apply` is called on the value returned by #strip_tags;
  # presumably Treat extends String to support this -- confirm.
  #
  # @return [Array<String>]
  def extract_words
    strip_tags.apply(:chunk, :segment, :tokenize).map(&:to_s)
  end

  # Counts how many times each element occurs in +array+.
  #
  # The tally is also stored in @word_count, replacing any previous tally.
  # The returned Hash has no default value, so looking up a word that was
  # never seen yields nil.
  #
  # @param array [#each] the words to count
  # @return [Hash] word => number of occurrences
  def count_words(array)
    @word_count = {}
    array.each do |word|
      # fetch supplies 0 for unseen words without installing a hash default.
      @word_count[word] = @word_count.fetch(word, 0) + 1
    end
    @word_count
  end

  # Orders the entries of +hash+ by ascending value.
  #
  # @param hash [Hash] for example the result of #count_words
  # @return [Array<Array>] [key, value] pairs sorted by value
  def order_hash(hash)
    hash.sort_by { |_key, value| value }
  end
end
|
data/lib/scraper.rb
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'mechanize'
|
|
2
|
+
require 'nokogiri'
|
|
3
|
+
|
|
4
|
+
# Fetches job postings linked from the We Work Remotely category page and
# writes each one to a text file under `postings/`.
class Scraper
  # Visits every link on the category page whose href matches /jobs/, skips
  # the "new job" link, and saves the <title> element plus the
  # `div.listing-container` markup of each linked page.
  #
  # NOTE(review): the skip compares href against an absolute URL; if the page
  # uses relative hrefs the "new job" link will not be skipped -- confirm.
  #
  # @return [Array] the links that were iterated
  def we_work
    agent = Mechanize.new
    listing = agent.get('https://weworkremotely.com/categories/2/jobs')
    listing.links_with(href: %r{/jobs/}).each_with_index do |link, index|
      next if link.href == 'https://weworkremotely.com/jobs/new'

      # Kept in its own local so the listing page above is not overwritten.
      doc = link.click.parser
      extracted_words = doc.css('title').to_s
      extracted_words << doc.css('div.listing-container').to_s
      save extracted_words, index
    end
  end

  # Writes +doc+ to postings/job_posting<index>.txt, relative to the current
  # working directory, overwriting any existing file.
  #
  # @param doc [String] text to write
  # @param index [Integer, #to_s] suffix used in the file name
  # @return [Integer] the value returned by the write call
  def save(doc, index)
    # Create the target directory first so the open does not raise ENOENT.
    Dir.mkdir('postings') unless Dir.exist?('postings')
    File.open("postings/job_posting#{index}.txt", 'w') { |f| f.write doc }
  end
end
|
data/lib/version.rb
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of the gem; frozen so the constant cannot be mutated in place.
VERSION = '0.0.1'.freeze
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
<div><ul><li><p> ninja <em>10x</em> wanted for ninja douchenozzle team</p></li></ul></div>
|
data/test/test_parser.rb
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
require 'minitest/autorun'
|
|
2
|
+
require_relative '../lib/parser'
|
|
3
|
+
|
|
4
|
+
# Minitest cases for Parser, using the posting at test/files/job_posting.txt.
class TestParser < Minitest::Test
  # Builds a fresh parser and the reference word tally before each test.
  def setup
    @parser = Parser.new('test/files/job_posting.txt')
    @hash = { '10x' => 1, 'ninja' => 3, 'douchenozzle' => 2 }
  end

  # Markup is removed and whitespace collapsed.
  def test_that_HTML_tags_are_stripped
    plain_text = 'ninja 10x wanted for ninja douchenozzle team'
    assert_equal plain_text, @parser.strip_tags
  end

  # Every word of the stripped posting is returned, in order.
  def test_that_words_are_extracted
    expected_words = %w[ninja 10x wanted for ninja douchenozzle team]
    assert_equal expected_words, @parser.extract_words
  end

  # Repeated words are tallied per word.
  def test_that_words_are_counted
    words = %w[10x ninja douchenozzle ninja douchenozzle ninja]
    assert_equal @hash, @parser.count_words(words)
  end

  # Tally entries come back as pairs sorted by ascending count.
  def test_that_the_hash_is_ordered
    expected_pairs = [['10x', 1], ['douchenozzle', 2], ['ninja', 3]]
    assert_equal expected_pairs, @parser.order_hash(@hash)
  end

  # Only the nouns of the posting are returned.
  def test_that_nouns_are_extracted
    expected_nouns = %w[team]
    assert_equal expected_nouns, @parser.extract_nouns
  end
end
|
|
32
|
+
|
metadata
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: juxtaparssionate
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Thomas Osborn
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2015-04-13 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: treat
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ! '>='
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ! '>='
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: rwordnet
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ! '>='
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0'
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ! '>='
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: nokogiri
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ! '>='
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :runtime
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ! '>='
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: mechanize
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - ! '>='
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: '0'
|
|
62
|
+
type: :runtime
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - ! '>='
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '0'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: sanitize
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - ! '>='
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: '0'
|
|
76
|
+
type: :runtime
|
|
77
|
+
prerelease: false
|
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - ! '>='
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '0'
|
|
83
|
+
description: juxtaparssionate parses and compares job postings from various popular
|
|
84
|
+
job boards. By comparing word frequency, sentence length, and parts-of-speech counts,
|
|
85
|
+
juxtaparssionate produces metrics loosely determining the quality and uniqueness
|
|
86
|
+
of a job posting
|
|
87
|
+
email: trosborn@gmail.com
|
|
88
|
+
executables: []
|
|
89
|
+
extensions: []
|
|
90
|
+
extra_rdoc_files: []
|
|
91
|
+
files:
|
|
92
|
+
- README.rdoc
|
|
93
|
+
- lib/parser.rb
|
|
94
|
+
- lib/scraper.rb
|
|
95
|
+
- lib/version.rb
|
|
96
|
+
- test/files/job_posting.txt
|
|
97
|
+
- test/test_parser.rb
|
|
98
|
+
homepage: http://www.github.com/trosborn/juxtaparssionate
|
|
99
|
+
licenses:
|
|
100
|
+
- MIT
|
|
101
|
+
metadata: {}
|
|
102
|
+
post_install_message:
|
|
103
|
+
rdoc_options: []
|
|
104
|
+
require_paths:
|
|
105
|
+
- lib
|
|
106
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
107
|
+
requirements:
|
|
108
|
+
- - ! '>='
|
|
109
|
+
- !ruby/object:Gem::Version
|
|
110
|
+
version: '0'
|
|
111
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
112
|
+
requirements:
|
|
113
|
+
- - ! '>='
|
|
114
|
+
- !ruby/object:Gem::Version
|
|
115
|
+
version: '0'
|
|
116
|
+
requirements: []
|
|
117
|
+
rubyforge_project:
|
|
118
|
+
rubygems_version: 2.4.5
|
|
119
|
+
signing_key:
|
|
120
|
+
specification_version: 4
|
|
121
|
+
summary: A tool for parsing and comparing job postings
|
|
122
|
+
test_files: []
|